Example #1
    def import_cbs_data_to_s3(self, emails_num=2, email_search_start_date=""):
        self.mail.login()
        self.mail.select_dir(self.mail_dir)
        recent_cbs_emails = self.get_recent_cbs_emails(
            emails_num=emails_num,
            email_search_start_date=email_search_start_date)
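        # fetch each matching message in full (RFC822) and scan it for zip attachments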
        for msgId, mtime in recent_cbs_emails:
            typ, message_parts = self.mail.imap_session.fetch(
                msgId, "(RFC822)")
            if typ != "OK":
                logging.error("Error fetching mail.")
                raise Exception("Error fetching mail")

            email_body = message_parts[0][1]
            if isinstance(email_body, bytes):
                email_body = email_body.decode("utf-8")
            mail = email.message_from_string(email_body)
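            # walk the MIME parts, skipping multipart containers and parts without a Content-Disposition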
            for part in mail.walk():
                if (part.get_content_maintype() == "multipart"
                        or part.get("Content-Disposition") is None):
                    continue
                filename = part.get_filename()

                if filename and filename.endswith(".zip"):
                    filename = "{0}-{1}_{2}-{3}.zip".format(
                        "cbs_data", mtime.date(), mtime.hour, mtime.minute)
                    self.create_temp_files_dir()
                    filepath = os.path.join(self.get_temp_files_dir(),
                                            filename)
                    if os.path.isfile(filepath):
                        break
                    logging.info("Currently loading: " + filename + "       ")
                    with open(filepath, "wb+") as fp:
                        fp.write(part.get_payload(decode=True))
                    # strip the ".zip" suffix to get the extraction directory
                    non_zip_path = os.path.splitext(filepath)[0]
                    with zipfile.ZipFile(filepath, "r") as zf:
                        zf.extractall(non_zip_path)
                    preprocessing_cbs_files.update_cbs_files_names(
                        non_zip_path)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        non_zip_path)
                    provider_code, year = get_file_type_and_year(
                        acc_data_file_path)

                    s3_uploader = S3Uploader()

                    # delete current cbs data from s3
                    s3_uploader.delete_from_s3(provider_code, year)

                    # upload new cbs data to s3
                    for file in os.scandir(non_zip_path):
                        s3_uploader.upload_to_S3(local_file_path=file.path,
                                                 provider_code=provider_code,
                                                 year=year)
                    self.delete_temp_files_dir()

        self.mail.imap_session.close()
        self.mail.imap_session.logout()
Example #2
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()

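            # build the list of CBS directories to import: either the single selected
            # folder or all nested year folders under the source path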
            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker and Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only starting year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)
            """
            Should be soon implemented as "delete_entries_from_S3"
            """
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)
            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
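                # import each year's directory for this provider code, from load_start_year up to the current year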
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        cbs_files_dir
                    )
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
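            # fetch the CBS zip attachment from the mailbox into the temp directory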
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(cbs_files_dir)
            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
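            # drop previously imported entries for this provider/year before re-importing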
            delete_cbs_entries_from_email(provider_code, year, batch_size)
            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))

        create_tables()
    except Exception as ex:
        print("Exception occured while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))