def import_cbs_data_to_s3(self, emails_num=2, email_search_start_date=""):
    """Fetch recent CBS emails, save their zip attachments, extract them and
    upload the extracted files to S3, replacing the current data for the same
    provider code and year."""
    self.mail.login()
    self.mail.select_dir(self.mail_dir)
    recent_cbs_emails = self.get_recent_cbs_emails(
        emails_num=emails_num, email_search_start_date=email_search_start_date
    )
    for msg_id, mtime in recent_cbs_emails:
        typ, message_parts = self.mail.imap_session.fetch(msg_id, "(RFC822)")
        if typ != "OK":
            logging.error("Error fetching mail.")
            raise Exception("Error fetching mail")
        email_body = message_parts[0][1]
        if isinstance(email_body, bytes):
            email_body = email_body.decode("utf-8")
        mail = email.message_from_string(email_body)
        for part in mail.walk():
            # skip multipart containers and parts that are not attachments
            if (
                part.get_content_maintype() == "multipart"
                or part.get("Content-Disposition") is None
            ):
                continue
            filename = part.get_filename()
            if bool(filename) and filename.endswith(".zip"):
                # name the saved attachment after the mail's timestamp
                filename = "{0}-{1}_{2}-{3}.zip".format(
                    "cbs_data", mtime.date(), mtime.hour, mtime.minute
                )
                self.create_temp_files_dir()
                filepath = os.path.join(self.get_temp_files_dir(), filename)
                if os.path.isfile(filepath):
                    # this attachment was already downloaded
                    break
                logging.info("Currently loading: " + filename)
                with open(filepath, "wb+") as fp:
                    fp.write(part.get_payload(decode=True))
                # extract into a sibling directory without the ".zip" suffix
                non_zip_path = filepath.replace(".zip", "")
                with zipfile.ZipFile(filepath, "r") as zf:
                    zf.extractall(non_zip_path)
                preprocessing_cbs_files.update_cbs_files_names(non_zip_path)
                acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                    non_zip_path
                )
                provider_code, year = get_file_type_and_year(acc_data_file_path)
                s3_uploader = S3Uploader()
                # delete current cbs data from s3
                s3_uploader.delete_from_s3(provider_code, year)
                # upload new cbs data to s3
                for entry in os.scandir(non_zip_path):
                    s3_uploader.upload_to_S3(
                        local_file_path=entry.path,
                        provider_code=provider_code,
                        year=year,
                    )
                self.delete_temp_files_dir()
    self.mail.imap_session.close()
    self.mail.imap_session.logout()
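# Usage sketch for the method above (illustrative only; the importer class name
# `CBSMailImporter` and its constructor arguments are assumptions, not part of
# this code):
#
#     importer = CBSMailImporter(username="user@example.com", password="...")
#     importer.import_cbs_data_to_s3(emails_num=2)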
def main(
    specific_folder,
    delete_all,
    path,
    batch_size,
    delete_start_date,
    load_start_year,
    from_email,
    username="",
    password="",
    email_search_start_date="",
    from_s3=False,
):
    try:
        if not from_email and not from_s3:
            # import from a local directory
            import_ui = ImporterUI(path, specific_folder, delete_all)
            dir_name = import_ui.source_path()
            if specific_folder:
                dir_list = [dir_name]
            else:
                dir_list = glob.glob("{0}/*/*".format(dir_name))

            # wipe all the AccidentMarker, Vehicle and Involved data first
            if import_ui.is_delete_all():
                truncate_tables(db, (Vehicle, Involved, AccidentMarker))
            elif delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)

            started = datetime.now()
            total = 0
            for directory in sorted(dir_list, reverse=False):
                directory_name = os.path.basename(os.path.normpath(directory))
                year = directory_name[1:5] if directory_name[0] == "H" else directory_name[0:4]
                if int(year) >= int(load_start_year):
                    parent_directory = os.path.basename(
                        os.path.dirname(os.path.join(os.pardir, directory))
                    )
                    provider_code = get_provider_code(parent_directory)
                    logging.info("Importing Directory " + directory)
                    total += import_to_datastore(directory, provider_code, int(year), batch_size)
                else:
                    logging.info(
                        "Importing only from year {0}. Directory {1} has year {2}".format(
                            load_start_year, directory_name, year
                        )
                    )
        elif from_s3:
            # import from S3
            logging.info("Importing data from s3...")
            s3_handler = S3Handler()
            s3_handler.get_files_from_s3(start_year=load_start_year)

            # TODO: should soon be implemented as "delete_entries_from_S3"
            # delete_cbs_entries_from_email(provider_code, year, batch_size)
            if delete_start_date is not None:
                delete_cbs_entries(delete_start_date, batch_size)

            started = datetime.now()
            total = 0
            for provider_code in [
                BE_CONST.CBS_ACCIDENT_TYPE_1_CODE,
                BE_CONST.CBS_ACCIDENT_TYPE_3_CODE,
            ]:
                for year in range(int(load_start_year), s3_handler.current_year + 1):
                    cbs_files_dir = os.path.join(
                        s3_handler.local_files_directory,
                        ACCIDENTS_TYPE_PREFIX + "_" + str(provider_code),
                        str(year),
                    )
                    logging.info("Importing Directory " + cbs_files_dir)
                    preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
                    acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(
                        cbs_files_dir
                    )
                    total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(s3_handler.local_temp_directory)
        else:
            # import from email attachments
            logging.info("Importing data from mail...")
            temp_dir = tempfile.mkdtemp()
            zip_path = importmail_cbs.main(temp_dir, username, password, email_search_start_date)
            if zip_path is None:
                logging.info("No new cbs files found")
                return
            zip_ref = zipfile.ZipFile(zip_path, "r")
            cbs_files_dir = os.path.join(temp_dir, "cbsfiles")
            if not os.path.exists(cbs_files_dir):
                os.makedirs(cbs_files_dir)
            zip_ref.extractall(cbs_files_dir)
            zip_ref.close()

            preprocessing_cbs_files.update_cbs_files_names(cbs_files_dir)
            acc_data_file_path = preprocessing_cbs_files.get_accidents_file_data(cbs_files_dir)
            provider_code, year = get_file_type_and_year(acc_data_file_path)
            delete_cbs_entries_from_email(provider_code, year, batch_size)

            started = datetime.now()
            total = 0
            logging.info("Importing Directory " + cbs_files_dir)
            total += import_to_datastore(cbs_files_dir, provider_code, year, batch_size)
            shutil.rmtree(temp_dir)

        fill_db_geo_data()

        failed = [
            "\t'{0}' ({1})".format(directory, fail_reason)
            for directory, fail_reason in failed_dirs.items()
        ]
        logging.info(
            "Finished processing all directories{0}{1}".format(
                ", except:\n" if failed else "", "\n".join(failed)
            )
        )
        logging.info("Total: {0} items in {1}".format(total, time_delta(started)))
        create_tables()
    except Exception as ex:
        print("Exception occurred while loading the cbs data: {0}".format(str(ex)))
        print("Traceback: {0}".format(traceback.format_exc()))
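# A minimal sketch of invoking main() directly, assuming this module is run as
# a script; all argument values below are illustrative placeholders, and the
# real project may wire main() to a CLI entry point instead:
#
#     if __name__ == "__main__":
#         main(
#             specific_folder=False,
#             delete_all=False,
#             path="<local cbs data dir>",
#             batch_size=5000,
#             delete_start_date=None,
#             load_start_year=2019,
#             from_email=False,
#             from_s3=True,
#         )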