#!/usr/bin/env python
# coding=utf-8
import os
import sys
import datetime
from sqlalchemy import create_engine, pool
from conf import config
from utility import get_engine_str

# number of parallel processes
_process_nums = 6
# execution script for a single task
_data_export_py = os.path.join(sys.path[0], "data_export.py")
# log path
_log_path = os.path.join(config.LOG_PATH, "data_export")
if not os.path.exists(_log_path):
    os.makedirs(_log_path)
# mysql database connection
con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
Dlogger = config.get_logger("DataExportAuto")


def parallel_write_log(no, id, connection_name, db_name, table_name, last_exec_date, retry_count):
    try:
        current_date = str(datetime.date.today())
        if last_exec_date == current_date:
            print("{no:<3} {table_name:<40} already succeeded today, skip.".format(
                no=no, table_name=db_name + "." + table_name))
            return ""
        full_log_path = "{log_path}/sqoop_export_{db_name}_{table_name}.log".format(
            log_path=_log_path, db_name=db_name, table_name=table_name)
        sh_cmd = "{python} -u {date_export_py} -w {connection_name} --db {db_name} --tb {table_name} &>> {full_log_path}\n". \
            format(python=config.PYTHON3,
#!/usr/bin/env python
# coding=utf-8
import datetime
import pandas as pd
from sqlalchemy import create_engine, pool
from utility import get_engine_str
from conf import config

Dlogger = config.get_logger("DataRowCount")
mysql_engine_str = get_engine_str("mysql").format(**config.DB_CONF)
mysql_con = create_engine(mysql_engine_str, poolclass=pool.NullPool)


def update_rowcount(row):
    sql = "select count(1) as cnt from {table_name}".format(
        table_name=row["table_name"])
    db_conf = {
        "host": row["host"],
        "port": row["port"],
        "user": row["user"],
        "password": row["password"],
        "database": row["db_name"],
        "charset": "utf8"
    }
    engine_str = get_engine_str(row["db_type"]).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # Dlogger.debug(sql)
    try:
        result = pd.read_sql(sql, con)
        table_rows = result.iat[0, 0]
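
# A minimal driver sketch, assuming the per-table connection metadata is kept
# in a MySQL table readable through mysql_con; the table name "meta_import"
# and its column layout are assumptions, not taken from the fragment above.
def refresh_all_rowcounts():
    meta_df = pd.read_sql("select * from meta_import", mysql_con)
    # update_rowcount expects a row carrying host/port/user/password/db_name/
    # table_name/db_type keys, so apply it row by row.
    meta_df.apply(update_rowcount, axis=1)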
#!/usr/bin/env python
# coding=utf-8
"""
Shell execution was chosen because its logs are more detailed.
"""
import re
import datetime
import subprocess
import pandas as pd
import argparse
from sqlalchemy import create_engine, pool
from conf import config
from utility import get_engine_str

Dlogger = config.get_logger("DataMerge")


def merge_data(pms_table):
    clinic_table = "pet_medical.mir_clinic"
    # Tables the HIS clinic module does not have: clinic-related data from the old Xiaonuan system
    if re.search("CEMRecord|CPrescriptions|CPrescriptionDetails|CPhysical|CEMRecordPhysical|CLabReport|CLabVirusDetails|PBCheckList|PXRaysList|PPathologyList", pms_table, re.I):
        if "cemrecord" in pms_table.lower():
            file_type = "rcfile"
        else:
            file_type = "orc"
        sql = """
            drop table {data_xiaonuan_final_table};
            create table {data_xiaonuan_final_table} stored as {file_type} as
            select * from {pms_table};
            """
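
# Hedged sketch of how the generated SQL might be handed to the hive CLI via
# the shell, in line with the docstring's note that shell execution gives more
# detailed logs. The helper name and the plain `hive -e` call are assumptions,
# not the original file's implementation.
def run_hive_sql(sql_text):
    cmd = 'hive -e "{sql}"'.format(sql=sql_text.replace('"', r'\"'))
    Dlogger.info(cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        raise Exception("hive command failed with exit code {}".format(ret))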
#!/usr/bin/env python
# coding=utf-8
import argparse
import sys
import re
import subprocess
import pandas as pd
from sqlalchemy import create_engine, pool
from conf import config
from utility import get_engine_str, is_valid, get_yesterday

Dlogger = config.get_logger("DataUnique")


def is_increase(hive_full_table):
    sql = """select filter,max_value from meta_import where lower(hive_database)=lower('{hive_database}') and lower(hive_table)=lower('{hive_table}') limit 1;""" \
        .format(hive_database=hive_full_table.split(".")[0],
                hive_table=hive_full_table.split(".")[1])
    engine_str = get_engine_str("mysql").format(**config.DB_CONF)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    Dlogger.info(sql)
    # rows=con.execute(sql)
    df = pd.read_sql(sql=sql, con=con)
    if not df.empty:
        filter = df.iat[0, 0]
        max_value = df.iat[0, 1]
        if is_valid(filter) and is_valid(max_value):
            return True
    return False
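
# Hedged usage illustration, not part of the original file: a table counts as
# incremental when meta_import stores both a filter and a max_value for it,
# and the branch below only logs which treatment would apply.
def dedup_if_needed(hive_full_table):
    if is_increase(hive_full_table):
        Dlogger.info(hive_full_table + " is imported incrementally, deduplication is required")
    else:
        Dlogger.info(hive_full_table + " is a full load, deduplication can be skipped")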
#!/usr/bin/env python
# coding=utf-8
import sys
from sqlalchemy import create_engine, pool
import pandas as pd
import argparse
from conf import config
from utility import get_engine_str
from schema_check import get_tabs

Dlogger = config.get_logger("InsertMetaData")


def pre_args():
    parse = argparse.ArgumentParser(prog="InsertMetaData", description="I am help message...")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, name of an already added database connection config. example: -w warmsoft")
    parse.add_argument("--db", required=True,
                       help="database, name of the source database to sync")
    parse.add_argument("--target_db",
                       help="target Hive database name; defaults to the same as --db when omitted")
    args = parse.parse_args()
    print(args)
    args_dict = {
        "connection_id": None,
        "connection_name": args.wizard,
        "db_type": "",
        "host": "",
        "user": "",
        "password": "",
        "port": 0,
        "jdbc_extend": "",
#!/usr/bin/env python
# coding=utf-8
from sqlalchemy import create_engine, pool
import pandas as pd
import re
import subprocess
import traceback
from conf import config
from utility import get_engine_str, send_mail

# Dlogger.basicConfig(level=Dlogger.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
Dlogger = config.get_logger("SchemaCheck")


def get_tabs(db_type, db_conf):
    tabs = []
    if db_type == "sqlserver":
        sql = "select name as table_name from sys.tables;"
        Dlogger.info("MSSQL Command = " + sql)
    elif db_type == "mysql":
        sql = "select table_name from information_schema.tables t where t.table_schema='{db_name}'".format(
            db_name=db_conf["database"])
        Dlogger.info("MySQL Command = " + sql)
    elif db_type == "oracle":
        sql = "select table_name from user_tables"
        Dlogger.info("Oracle command = " + sql)
    else:
        raise Exception("DATABASE TYPE ERROR !")
    engine_str = get_engine_str(db_type).format(**db_conf)
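
# Hedged usage sketch, not part of the original file: get_tabs picks the
# table-listing query for the given engine and is expected to return the table
# names found in the source database. The connection values below are
# placeholders only, not real configuration.
def list_demo_tables():
    demo_conf = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "reader",
        "password": "***",
        "database": "warmsoft",
        "charset": "utf8",
    }
    return get_tabs("mysql", demo_conf)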
# coding=utf-8
import pandas as pd
from sqlalchemy import create_engine, pool
import argparse
import subprocess
import datetime
import sys
import os
import re
import schema_check
from conf import config
from utility import get_engine_str, get_yesterday, is_valid
from data_unique import drop_duplicates

Dlogger = config.get_logger("DataImport")


def pre_args():
    parse = argparse.ArgumentParser(prog="DataImport", description="I am help message...")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, name of an already added database connection config. example: -w warmsoft")
    parse.add_argument("--db", default="",
                       help="<database> db_name in meta_import, case-insensitive")
    parse.add_argument("--tb", default="",
                       help="<table_name> table_name in meta_import, case-insensitive")
import pandas as pd
from sqlalchemy import create_engine, pool
import pymysql
import argparse
import subprocess
import datetime
import sys
import os
import re
import schema_check
import traceback
from conf import config
from utility import get_engine_str, get_yesterday, is_valid

Dlogger = config.get_logger("DataExport")


def pre_args():
    parse = argparse.ArgumentParser(
        prog="DataExport",
        description="I am help message... The default mode loads data into a temporary table and then renames it to the formal table. "
                    "Example1: python3 data_export.py -w xiaonuan_ddl --db xiaonuan --tb syscategory "
                    "Example2: python3 data_export.py -w xiaonuan_ddl --s data_xiaonuan_final.syscategory --t syscategory --mode=overwrite")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, name of an already added database connection config. example: -w xiaonuan_ddl")
    parse.add_argument("--db", default="",
                       help="<database> db_name in meta_export, case-insensitive")
#!/usr/bin/env python
# coding=utf-8
import os
import sys
from conf import config

# number of parallel processes
_process_nums = 8
# number of retries when a task fails
_retry_count = 3
# execution script for a single task
_data_merge_py = os.path.join(sys.path[0], "data_merge.py")
# log path
_log_path = os.path.join(config.LOG_PATH, "data_merge")
if not os.path.exists(_log_path):
    os.makedirs(_log_path)
Dlogger = config.get_logger("DataMergeAuto")


def parallel_write_log(no, tb):
    try:
        table_name = "mid_" + tb.lower()
        input_table_name = "pet_medical.ods_pms_" + tb.lower()
        full_log_path = "{log_path}/data_merge_{table_name}.log".format(
            log_path=_log_path, table_name=table_name)
        sh_cmd = "{python} -u {data_merge_py} -t {input_table_name} &>> {full_log_path}\n". \
            format(python=config.PYTHON3,
                   data_merge_py=_data_merge_py,
                   input_table_name=input_table_name,
                   full_log_path=full_log_path)
        flag = ""
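
# Hedged driver sketch, not part of the original fragment: the Pool usage, the
# table-list argument and the result handling below are assumptions about how
# parallel_write_log might be fanned out across _process_nums workers.
from multiprocessing import Pool


def run_all(tables):
    worker_pool = Pool(processes=_process_nums)
    async_results = [worker_pool.apply_async(parallel_write_log, (no, tb))
                     for no, tb in enumerate(tables, start=1)]
    worker_pool.close()
    worker_pool.join()
    # collect the return value of each task (the fragment above hints that a
    # non-empty flag marks a failed merge)
    return [r.get() for r in async_results]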