def getconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dic = {} dic["--connect"] = jsonobj["db.url"] # 使用的用户名 dic["--username"] = jsonobj["db.username"] # 使用的密码 dic["--password"] = '******' % (jsonobj["db.password"]) print "=================================" + qp_dt dic["-m"] = "1" dic["--hive-import"] = " " partitionname = "biz_end_date" # partitionvalue ="30001231" partitionvalue = qp_dt partitionstr = " --hive-partition-key %s --hive-partition-value %s " % (partitionname, partitionvalue) dic["--hive-table"] = jsonobj["hive_db"] + "." + jsonobj["hive_table"] + partitionstr tablename = jsonobj["hive_db"] + "." + jsonobj["hive_table"] truncateTable(tablename, qp_dt) dic["--split-by"] = jsonobj["sqoop.split-by"] querySql = buildQuerySql(jsonobj) dic["--query"] = '" %s "' % (querySql,) print "sqoop import sql:" + dic["--query"] current_time = datetime.datetime.now() date_str = current_time.strftime("%Y%m%d") date_str = etlDate time_str = current_time.strftime("%Y%m%d%H%M") dic["--fields-terminated-by"] = " '\\001' " dic["--null-string"] = " '\\\N' " dic["--null-non-string"] = " '\\\N' " # dic["--as-parquetfile"] = " " dic["--verbose"] = " " dic["--inline-lob-limit"] = "16777216" tableBasePath = "%s/%s/%s/%s%s/%s/" % ( jsonobj["hdfs.root"], jsonobj["hdfs.category.input"], "init", jsonobj["hdfs.db_name"], jsonobj["hdfs.table_name"], jsonobj["hdfs.schema_version"], ) dic["--target-dir"] = tableBasePath + "%s/%s" % (date_str, time_str) print "temp dir:%s" % (dic["--target-dir"],) jsonConfigList.append(dic) return dic
def getconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dic = {} dic["--connect"] = jsonobj["db.url"] # 使用的用户名 dic["--username"] = jsonobj["db.username"] # 使用的密码 dic["--password"] = '******' % (jsonobj["db.password"]) print "=================================" + qp_dt dic["-m"] = "1" dic["--hive-import"] = " " partitionname = "biz_end_date" #partitionvalue ="30001231" partitionvalue = qp_dt partitionstr = " --hive-partition-key %s --hive-partition-value %s " % ( partitionname, partitionvalue) dic["--hive-table"] = jsonobj["hive_db"] + "." + jsonobj[ "hive_table"] + partitionstr tablename = jsonobj["hive_db"] + "." + jsonobj["hive_table"] truncateTable(tablename, qp_dt) dic["--split-by"] = jsonobj["sqoop.split-by"] querySql = buildQuerySql(jsonobj) dic["--query"] = '" %s "' % (querySql, ) print "sqoop import sql:" + dic["--query"] current_time = datetime.datetime.now() date_str = current_time.strftime("%Y%m%d") date_str = etlDate time_str = current_time.strftime("%Y%m%d%H%M") dic["--fields-terminated-by"] = " '\\001' " dic["--null-string"] = " '\\\N' " dic["--null-non-string"] = " '\\\N' " #dic["--as-parquetfile"] = " " dic["--verbose"] = " " dic["--inline-lob-limit"] = "16777216" tableBasePath = "%s/%s/%s/%s%s/%s/" % ( jsonobj["hdfs.root"], jsonobj["hdfs.category.input"], "init", jsonobj["hdfs.db_name"], jsonobj["hdfs.table_name"], jsonobj["hdfs.schema_version"]) dic["--target-dir"] = tableBasePath + "%s/%s" % (date_str, time_str) print "temp dir:%s" % (dic["--target-dir"], ) jsonConfigList.append(dic) return dic
def check_data(dbConfig, qp_dt, sys_dt, g_where_condition, fromdb, round_time):
    """Count rows on one side of an ETL transfer and record the count.

    When *fromdb* is 'source', counts rows in the source RDBMS table and
    inserts (round 1) or updates (round 2) the ETL log entry; otherwise
    counts rows in the target hive table and updates the log entry.
    Always returns 0.

    :param dbConfig: path/name of the table JSON config file.
    :param qp_dt: business data date being checked.
    :param sys_dt: run timestamp recorded in the log.
    :param g_where_condition: extra WHERE filter for the count queries.
    :param fromdb: 'source' to count the source side, anything else for target.
    :param round_time: check round number (1 = insert, 2 = update) on the source side.
    """
    cfg = etl_utils.parseconfs(dbConfig)
    src_table = cfg["db.table_name"]
    src_user = cfg["db.username"]
    src_pass = cfg["db.password"]
    hive_db = cfg["hive_db"]
    hive_table = cfg["hive_table"]
    src_db = cfg["db.database"]

    # Pick apart the JDBC-style URL: "<scheme>:<type>:...//host:port/tns"
    url = cfg["db.url"]
    src_type = url.split(":")[1]  # source DB type: oracle / mysql
    remainder = url[url.find("//") + 2:]
    pieces = remainder.split("/")
    src_tns = pieces[1]
    host_port = pieces[0].split(":")
    src_host = host_port[0]
    src_port = host_port[1]

    run_time = sys_dt
    data_date = qp_dt
    check_round = round_time
    run_type = fromdb

    if run_type == 'source':
        # row count on the source database side
        row_count = get_source_count(src_db, src_type, src_host, src_port,
                                     src_tns, src_user, src_pass, src_table,
                                     g_where_condition)
        # persist the count into the ETL log database
        log_conn = get_mysql_conn(ETL_LOG_HOST, ETL_LOG_PORT, ETL_LOG_DB,
                                  ETL_LOG_USER, ETL_LOG_PASS)
        if check_round == 1:
            insert_etl_log(log_conn, src_tns, src_table, data_date, run_time,
                           row_count)
        elif check_round == 2:
            update_source_etl_log(log_conn, src_tns, src_table, data_date,
                                  run_time, row_count)
    else:
        # row count on the target (hive) side
        row_count = get_target_count(hive_db, hive_table, g_where_condition)
        # persist the count into the ETL log database
        log_conn = get_mysql_conn(ETL_LOG_HOST, ETL_LOG_PORT, ETL_LOG_DB,
                                  ETL_LOG_USER, ETL_LOG_PASS)
        update_target_etl_log(log_conn, src_tns, src_table, data_date,
                              run_time, row_count)
    return 0
def getconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dic = {} dic["--connect"] = jsonobj["db.url"] # 使用的用户名 dic["--username"] = jsonobj["db.username"] # 使用的密码 dic["--password"] = '******' % (jsonobj["db.password"]) dic["-m"] = "1" dic["--split-by"] = jsonobj["sqoop.split-by"] querySql = buildQuerySql(jsonobj) dic["--query"] = '"%sand $CONDITIONS"' % (querySql, ) print "sqoop import sql:" + dic["--query"] current_time = datetime.datetime.now() date_str = current_time.strftime("%Y%m%d") date_str = etlDate time_str = current_time.strftime("%Y%m%d%H%M") tableBasePath = "%s/%s/%s/%s/%s/" % ( jsonobj["hdfs.root"], jsonobj["hdfs.category.input"], jsonobj["hdfs.db_name"], jsonobj["hdfs.table_name"], jsonobj["hdfs.schema_version"]) dic["--target-dir"] = tableBasePath + "%s/%s" % (date_str, time_str) print "target dir: " + dic["--target-dir"] dic["--fields-terminated-by"] = " '\001' " dic["--hive-drop-import-delims"] = " " #dic["--fields-terminated-by"] = " '\\001' " dic["--null-string"] = " '\\\N' " dic["--null-non-string"] = " '\\\N' " #dic["--as-parquetfile"] = " " dic["--verbose"] = " " dic["--inline-lob-limit"] = "16777216" jsonConfigList.append(dic) return dic
def getconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dic = {} dic["--connect"] = jsonobj["db.url"] # 使用的用户名 dic["--username"] = jsonobj["db.username"] # 使用的密码 dic["--password"] = '******'%(jsonobj["db.password"]) dic["-m"]= "1" dic["--split-by"]= jsonobj["sqoop.split-by"] querySql = buildQuerySql(jsonobj) dic["--query"]= '"%sand $CONDITIONS"'%(querySql,) print "sqoop import sql:" + dic["--query"] current_time = datetime.datetime.now() date_str = current_time.strftime("%Y%m%d") date_str = etlDate time_str = current_time.strftime("%Y%m%d%H%M") tableBasePath = "%s/%s/%s/%s/%s/"%(jsonobj["hdfs.root"],jsonobj["hdfs.category.input"],jsonobj["hdfs.db_name"], jsonobj["hdfs.table_name"],jsonobj["hdfs.schema_version"]) dic["--target-dir"] = tableBasePath + "%s/%s"%(date_str,time_str) print "target dir: " + dic["--target-dir"] dic["--fields-terminated-by"] = " '\001' " dic["--hive-drop-import-delims"] = " " #dic["--fields-terminated-by"] = " '\\001' " dic["--null-string"] = " '\\\N' " dic["--null-non-string"] = " '\\\N' " #dic["--as-parquetfile"] = " " dic["--verbose"] = " " dic["--inline-lob-limit"] = "16777216" jsonConfigList.append(dic) return dic
def parseconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dbConfigFileName = dbConfig print "table json Schema file: " + dbConfigFileName dic = {} dic["hive_table"] = jsonobj["hive_table"] dic["hive_db"] = jsonobj["hive_db"] dic["table_name"] = jsonobj["db.table_name"] colsList = jsonobj["columns"] srcTbKeys = jsonobj["db.table_keys"] srcTbKeysList = srcTbKeys.split(",") srcTbKeysList = etl_utils.formatList(srcTbKeysList) colsStr = "" count = 0 joincount = 0 colStr_md5 = "" colStr_as_h = "" colStr_as_m = "" colStr_h = "" colStr_coalesce = "" colStr_m = "" joinColStr = "" p_k = "" for col in colsList: #if col["primary_key"]=="true" or (col["name"] in srcTbKeysList): if (col["name"].upper() in srcTbKeysList): joinCol = "h.h_" + col["name"] + "=" + "m.m_" + col["name"] p_k = col["name"] if joincount == 0: joinColStr = joinColStr + joinCol else: joinColStr = joinColStr + " and " + joinCol joincount = joincount + 1 colName = col["name"] colType = col["type"] if count == 0: #用于拼H表字段串 colStr_as_h = colStr_as_h + "%s" % (colName) + " as " + "h_%s" % ( colName) #用于拼m表字段串 colStr_as_m = colStr_as_m + "%s" % (colName) + " as " + "m_%s" % ( colName) #用于拼第一个插入字段串 colStr_h = colStr_h + "h_%s" % (colName) #用于拼第二个字段串 colStr_coalesce = colStr_coalesce + "coalesce(h_%s" % ( colName) + " , " + "m_%s)" % (colName) #用于拼第二个字段串 colStr_m = colStr_m + "m_%s" % (colName) #用于拼md5串 if colType[0:7] == "decimal": colName = "cast(" + colName + " as string)" colStr_md5 = colStr_md5 + "%s" % (colName) else: colStr_md5 = colStr_md5 + "%s" % (colName) else: #用于拼H表字段串 colStr_as_h = colStr_as_h + ",%s" % (colName) + " as " + "h_%s" % ( colName) #用于拼m表字段串 colStr_as_m = colStr_as_m + ",%s" % (colName) + " as " + "m_%s" % ( colName) #用于拼第一个插入字段串 colStr_h = colStr_h + ",h_%s" % (colName) #用于拼第二个字段串 colStr_coalesce = colStr_coalesce + ",coalesce(h_%s" % ( colName) + " , " + "m_%s)" % (colName) #用于拼第三个字段串 colStr_m = colStr_m + ",m_%s" % (colName) #用于拼md5串 if colType[0:7] == "decimal": colName = 
"cast(" + colName + " as string)" colStr_md5 = colStr_md5 + ",%s" % (colName) else: colStr_md5 = colStr_md5 + ",%s" % (colName) count = count + 1 print "joinColStr: " + joinColStr dic["joinColStr"] = joinColStr print "p_k: " + p_k dic["p_k"] = p_k print "colStr_md5:%s" % (colStr_md5, ) dic["colStr_md5"] = colStr_md5 print "colStr_as_h:%s" % (colStr_as_h, ) dic["colStr_as_h"] = colStr_as_h print "colStr_as_m:%s" % (colStr_as_m, ) dic["colStr_as_m"] = colStr_as_m print "colStr_h:%s" % (colStr_h, ) dic["colStr_h"] = colStr_h print "colStr_coalesce:%s" % (colStr_coalesce, ) dic["colStr_coalesce"] = colStr_coalesce print "colStr_m:%s" % (colStr_m, ) dic["colStr_m"] = colStr_m jsonConfigList.append(dic) return dic
def parseconfs(dbConfig): jsonobj = etl_utils.parseconfs(dbConfig) dbConfigFileName = dbConfig print "table json Schema file: " + dbConfigFileName dic = {} dic["hive_table"]=jsonobj["hive_table"] dic["hive_db"] = jsonobj["hive_db"] dic["table_name"] = jsonobj["db.table_name"] colsList = jsonobj["columns"] srcTbKeys = jsonobj["db.table_keys"] srcTbKeysList = srcTbKeys.split(",") srcTbKeysList = etl_utils.formatList(srcTbKeysList) colsStr = "" count = 0 joincount = 0 colStr_md5 = "" colStr_as_h = "" colStr_as_m = "" colStr_h = "" colStr_coalesce = "" colStr_m = "" joinColStr = "" p_k = "" for col in colsList: #if col["primary_key"]=="true" or (col["name"] in srcTbKeysList): if (col["name"].upper() in srcTbKeysList): joinCol = "h.h_" + col["name"] + "=" + "m.m_" + col["name"] p_k = col["name"] if joincount == 0: joinColStr = joinColStr + joinCol else: joinColStr = joinColStr + " and " + joinCol joincount = joincount +1 colName = col["name"] colType = col["type"] if count == 0: #用于拼H表字段串 colStr_as_h = colStr_as_h + "%s"%(colName) + " as " + "h_%s"%(colName) #用于拼m表字段串 colStr_as_m = colStr_as_m + "%s"%(colName) + " as " + "m_%s"%(colName) #用于拼第一个插入字段串 colStr_h = colStr_h + "h_%s"%(colName) #用于拼第二个字段串 colStr_coalesce = colStr_coalesce + "coalesce(h_%s"%(colName) + " , " + "m_%s)"%(colName) #用于拼第二个字段串 colStr_m = colStr_m + "m_%s"%(colName) #用于拼md5串 if colType[0:7]=="decimal": colName = "cast(" + colName + " as string)" colStr_md5 = colStr_md5 + "%s"%(colName) else: colStr_md5 = colStr_md5 + "%s"%(colName) else: #用于拼H表字段串 colStr_as_h = colStr_as_h + ",%s"%(colName) + " as " + "h_%s"%(colName) #用于拼m表字段串 colStr_as_m = colStr_as_m + ",%s"%(colName) + " as " + "m_%s"%(colName) #用于拼第一个插入字段串 colStr_h = colStr_h + ",h_%s"%(colName) #用于拼第二个字段串 colStr_coalesce = colStr_coalesce + ",coalesce(h_%s"%(colName) + " , " + "m_%s)"%(colName) #用于拼第三个字段串 colStr_m = colStr_m + ",m_%s"%(colName) #用于拼md5串 if colType[0:7]=="decimal": colName = "cast(" + colName + " as string)" colStr_md5 = 
colStr_md5 + ",%s"%(colName) else: colStr_md5 = colStr_md5 + ",%s"%(colName) count = count + 1 print "joinColStr: " + joinColStr dic["joinColStr"] = joinColStr print "p_k: " + p_k dic["p_k"] = p_k print "colStr_md5:%s"%(colStr_md5,) dic["colStr_md5"] = colStr_md5 print "colStr_as_h:%s"%(colStr_as_h,) dic["colStr_as_h"] = colStr_as_h print "colStr_as_m:%s"%(colStr_as_m,) dic["colStr_as_m"] = colStr_as_m print "colStr_h:%s"%(colStr_h,) dic["colStr_h"] = colStr_h print "colStr_coalesce:%s"%(colStr_coalesce,) dic["colStr_coalesce"] = colStr_coalesce print "colStr_m:%s"%(colStr_m,) dic["colStr_m"] = colStr_m jsonConfigList.append(dic) return dic