Exemplo n.º 1
0
def run(
    departement,
    prediction_beginning_date=None,
    return_df_etab_if_successful=False,
):
    """
    Returns True if computation successful and False otherwise.
    """
    if prediction_beginning_date is None:
        prediction_beginning_date = compute_prediction_beginning_date()
    check_prediction_beginning_date(prediction_beginning_date)
    # get df_etab with monthly hirings for each hiring_type
    # hirings are not yet grouped by period (i.e. 6 months for DPAE / 6 months for Alternance)
    df_etab = get_df_etab_with_hiring_monthly_aggregates(
        departement, prediction_beginning_date)
    logger.debug(
        "df_etab_with_hiring_monthly_aggregates loaded for departement %s",
        departement)

    if df_etab is None:
        logger.warning("no etab/hiring data found for departement %s",
                       departement)
        return False  # failed computation (e.g. no data)

    # LBB / DPAE.

    train(
        df_etab,
        departement,
        prediction_beginning_date=prediction_beginning_date,
        last_historical_data_date=go_back_last_day_of_the_month(
            DpaeStatistics.get_last_historical_data_date(DpaeStatistics.DPAE)),
        months_per_period=6,
        training_periods=7,
        prefix_for_fields='dpae',
        score_field_name='score',
    )
    logger.debug("DPAE/LBB training done for departement %s", departement)

    # LBA / Alternance.
    train(df_etab,
          departement,
          prediction_beginning_date=prediction_beginning_date,
          last_historical_data_date=go_back_last_day_of_the_month(
              DpaeStatistics.get_last_historical_data_date(
                  DpaeStatistics.APR)),
          months_per_period=6,
          training_periods=7,
          prefix_for_fields='alt',
          score_field_name='score_alternance',
          is_lbb=False)
    logger.debug("Alternance/LBA training done for departement %s",
                 departement)

    # Final data export.

    export_df_etab_to_db(df_etab, departement)
    if return_df_etab_if_successful:
        return df_etab  # only used in test_compute_score.py for inspection
    return True  # successful computation
Exemplo n.º 2
0
    def run(self):
        """
        Tricky parallelization.
        In some cases the jobs take a long time to run, so there's COMPUTE_SCORE_TIMEOUT for that.
        If a job was not finished before the timeout, it will be added to a list to be processed sequentially.
        """
        results = []
        pool = multiprocessing.Pool(processes=8, maxtasksperchild=1)
        async_results = {}
        most_recent_data_date = DpaeStatistics.get_most_recent_data_date()

        if '75' in settings.DEPARTEMENTS:
            departements = deque(settings.DEPARTEMENTS)
            departements.remove('75')
            departements.appendleft('75')
        else:
            departements = list(settings.DEPARTEMENTS)

        if COMPUTE_SCORES_DEBUG_MODE:
            if len(COMPUTE_SCORES_DEBUG_DEPARTEMENTS) >= 1:
                departements = COMPUTE_SCORES_DEBUG_DEPARTEMENTS

        for departement in departements:
            abortable_func = partial(abortable_worker,
                                     compute,
                                     timeout=COMPUTE_SCORE_TIMEOUT)
            async_result = pool.apply_async(
                abortable_func,
                args=(settings.OFFICE_TABLE, settings.DPAE_TABLE, departement,
                      most_recent_data_date))
            async_results[departement] = async_result

        logger.info("going to close process pool...")
        pool.close()
        logger.info("going to join pool")
        pool.join()
        logger.info("all compute_score done, analyzing results...")

        for departement, async_result in async_results.iteritems():
            try:
                result = async_result.get()
                if not bool(result):
                    logger.info("departement with error : %s", departement)
                results.append([departement, bool(result)])
            except:
                logger.error("traceback for unprocessed_departement: %s",
                             traceback.format_exc())

        logger.info("compute_scores FINISHED")
        return results
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. 'lbb_xdpdpae_delta_201611102200.bz2'
        # will match 2018-09-12
        date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'  #We keep only the date in the file name, ex: 20190910 = 10th september 2019
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y%m%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
                                        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                                                          Hiring.CONTRACT_TYPE_CDD,
                                                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
        if last_historical_data_date_in_db is None:
            last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip(
            )  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (siret, hiring_date, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label, duree_pec,
                                 date_insertion)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file
        ])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type)
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
            # So we insert it directly via an SQL query
            # This job has been broken for more than a year, only way to fix it :
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime(
                '%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")

        logger.info("finished importing dpae...")
        return something_new
def get_dpae_last_historical_data_date():
    return DpaeStatistics.get_last_historical_data_date(DpaeStatistics.DPAE)
Exemplo n.º 5
0
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. 'LBB_XDPDPAE_2018-09-12_2017-08-01.bz2'
        # will match 2018-09-12
        date_pattern = r'.*_(\d\d\d\d-\d\d-\d\d)_'
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[0]
            self.last_historical_data_date_in_file = datetime.strptime(
                date_part, "%Y-%m-%d")
            logger.debug("identified last_historical_data_date_in_file=%s",
                         self.last_historical_data_date_in_file)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be \
                like LBB_XDPDPAE_YYYY-MM-DD_YYYY-MM-DD.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        last_historical_data_date_in_db = DpaeStatistics.get_last_historical_data_date(
        )

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s",
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip(
            )  # FIXME detect column positions from header
            if b"siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                dpae_should_be_imported = (
                    hiring_date > last_historical_data_date_in_db
                    and hiring_date <= self.last_historical_data_date_in_file
                    # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                    # and we ignore CTT.
                    and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                         (contract_type == Hiring.CONTRACT_TYPE_CDD
                          and contract_duration is not None
                          and contract_duration > 31)))

                if dpae_should_be_imported:
                    statement = (
                        siret,
                        hiring_date,
                        contract_type,
                        departement,
                        contract_duration,
                        iiann,
                        tranche_age,
                        handicap_label,
                        duree_pec,
                    )
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
        )
        statistics.save()
        logger.info("finished importing dpae...")
        return something_new
    def run_task(self):
        date_insertion = datetime.now()
        logger.info("extracting %s ", self.input_filename)
        # this pattern matches the first date
        # e.g. '20200803ExtractApp'
        # will match 20200803
        date_string = self.input_filename.split('/')[-1][0:8] 
        try:
            self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
        except ValueError:
            raise Exception("couldn't find a date pattern in filename. filename should be \
                like 20200803ExtractApp.csv")

        count = 0
        statements = []
        something_new = False
        query = """
            INSERT into %s(
                siret,
                hiring_date,
                contract_type,
                departement,
                contract_duration,
                iiann,
                tranche_age,
                handicap_label,
                duree_pec,
                date_insertion
                )
            values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
        """ % settings.HIRING_TABLE
        imported_alternance_contracts = 0
        imported_alternance_contracts_distribution = {}
        not_imported_alternance_contracts = 0

        last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
                                                            .filter(Hiring.contract_type == self.contract_type).first()[0]

        logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                    last_historical_data_date_in_db, self.last_historical_data_date_in_file)

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()   # FIXME detect column positions from header
            
            if b"SIRET" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")

            for line in myfile:
                line = line.decode()
                count += 1
                if not count % 10000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, departement = parse_alternance_line(line)
                except InvalidRowException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue
                except InvalidSiretException:
                    error_message = traceback.format_exc()
                    logger.info("invalid siret met at row: %i", count)
                    logger.info(error_message)
                    self.invalid_siret_errors += 1
                    continue
                except InvalidZipCodeException:
                    logger.info("invalid zip code met at row: %i", count)
                    self.invalid_zipcode_errors += 1
                    continue
                
                # This part of code is useless : 
                #   The data used has a lot of late contracts inputs
                #   So we have to insert ALL the contracts from different dates

                #  alternance_contract_should_be_imported = (
                #      hiring_date > last_historical_data_date_in_db 
                #      and hiring_date <= self.last_historical_data_date_in_file
                #)

                if hiring_date <= self.last_historical_data_date_in_file:
                    statement = (
                        siret,
                        hiring_date,
                        self.contract_type,
                        departement,
                        None, #contract_duration
                        None, #iiann
                        None, #tranche_age
                        None, #handicap_label
                        None,  #duree_pec
                        date_insertion

                    )
                    statements.append(statement)
                    imported_alternance_contracts += 1

                    if hiring_date.year not in imported_alternance_contracts_distribution:
                        imported_alternance_contracts_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                    if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                        imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
            raise

        logger.info(f"Types de contrats à importer : {self.contract_name}")
        logger.info(f"processed {count} lba_contracts...")
        logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
        logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
        logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
        logger.info(f"invalid_row errors: {self.invalid_row_errors}")
        logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
#        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
#            raise IOError('too many zipcode errors')
#        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
#            raise IOError('too many invalid_row errors')

        con.commit()
        cur.close()
        con.close()

        try:
            statistics = DpaeStatistics(
                last_import=datetime.now(),
                most_recent_data_date=self.last_historical_data_date_in_file,
                file_type=self.file_type
            )
            db_session.add(statistics)
            db_session.commit()
            logger.info("First way to insert DPAE statistics in DB : OK")
        except OperationalError:
            # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
            # So we insert it directly via an SQL query
            # This job has been broken for more than a year, only way to fix it : 
            db_session.rollback()
            last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
            query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
            con, cur = import_util.create_cursor()
            cur.execute(query)
            con.commit()
            cur.close()
            con.close()
            logger.info("Second way to insert DPAE statistics in DB : OK")


        logger.info("finished importing dpae...")
        return something_new
Exemplo n.º 7
0
    def run_task(self):
        logger.info("extracting %s ", self.input_filename)
        date_pattern = ".*_(\d\d\d\d\d\d\d\d)"
        date_match = re.match(date_pattern, self.input_filename)
        if date_match:
            date_part = date_match.groups()[-1]
            self.most_recent_data_date = datetime.strptime(date_part, "%Y%m%d")
            logger.debug("identified most_recent_data_date=%s" %
                         self.most_recent_data_date)
        else:
            raise Exception(
                "couldn't find a date pattern in filename. filename should be dpae_XYZ_20xxxxxx.tar.gz"
            )

        count = 0
        statements = []
        something_new = False
        query = "INSERT into %s(siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label) values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)" % settings.DPAE_TABLE
        imported_dpae = 0
        imported_dpae_distribution = {}
        not_imported_dpae = 0
        initial_most_recent_data_date = DpaeStatistics.get_most_recent_data_date(
        )

        logger.info(
            "will now extract all dpae with hiring_date between %s and %s" %
            (initial_most_recent_data_date, self.most_recent_data_date))

        with import_util.get_reader(self.input_filename) as myfile:
            con, cur = import_util.create_cursor()
            header_line = myfile.readline().strip()
            if "siret" not in header_line:
                logger.debug(header_line)
                raise Exception("wrong header line")
            for line in myfile:
                count += 1
                if not count % 100000:
                    logger.debug("reading line %i", count)
                    try:
                        try:
                            cur.executemany(query, statements)
                        except OperationalError:  # retry once in case of deadlock error
                            time.sleep(10)
                            cur.executemany(query, statements)
                        statements = []
                        con.commit()
                        something_new = True
                    except:
                        logger.error(
                            "error in executing statement into dpae table: %s",
                            sys.exc_info()[1])
                        statements = []
                        raise
                try:
                    siret, hiring_date, zipcode, contract_type, departement, contract_duration, iiann, tranche_age, handicap_label = parse_dpae_line(
                        line)
                except ValueError:
                    self.zipcode_errors += 1
                    continue
                except DepartementException:
                    self.zipcode_errors += 1
                    continue
                except TooFewFieldsException:
                    logger.info("invalid_row met at row: %i", count)
                    self.invalid_row_errors += 1
                    continue

                if hiring_date > initial_most_recent_data_date and hiring_date <= self.most_recent_data_date:
                    statement = (siret, hiring_date, zipcode, contract_type,
                                 departement, contract_duration, iiann,
                                 tranche_age, handicap_label)
                    statements.append(statement)
                    imported_dpae += 1

                    if hiring_date.year not in imported_dpae_distribution:
                        imported_dpae_distribution[hiring_date.year] = {}
                    if hiring_date.month not in imported_dpae_distribution[
                            hiring_date.year]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month] = {}
                    if hiring_date.day not in imported_dpae_distribution[
                            hiring_date.year][hiring_date.month]:
                        imported_dpae_distribution[hiring_date.year][
                            hiring_date.month][hiring_date.day] = 0
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] += 1
                else:
                    not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors >= 100:
            raise Exception('too many zipcode errors')
        if self.invalid_row_errors >= 100:
            raise Exception('too many invalid_row errors')
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.most_recent_data_date)
        statistics.save()
        con.commit()
        logger.info("finished importing dpae...")
        return something_new