def deleteT(db, date):
    """Flag wash-metadata tables that are no longer present in Hive.

    Every row returned by ``getCT(db)`` whose ``Data_Tbl_Phys_Nm`` does not
    appear among the rows of ``gethiveT(db)`` gets its ``Del_Dt`` set to
    ``date`` in ``data_tbl``.  When ``getCT(db)`` returns nothing, all Hive
    tables are inserted into ``data_tbl`` instead.

    Args:
        db: Database name handed to ``gethiveT`` and ``getCT``.
        date: Value stored in ``Create_Dt`` (first load) or ``Del_Dt``.

    The process exits with status 1 if the ``Del_Dt`` update fails.
    """
    tables1 = gethiveT(db)  # rows describing the tables Hive knows about
    tables2 = getCT(db)  # rows already recorded in the wash metadata
    conn = MySQL(config.washmeta)
    tables1 = getDBs(tables1)

    if len(tables2) == 0:
        # Nothing recorded yet: load every Hive table into data_tbl.
        for table1 in tables1:
            table1['Create_Dt'] = date
            table1['Data_Tbl_UUID'] = uuid.uuid1()
            logging.debug('table1:' + table1['Data_Tbl_Phys_Nm'])
            conn.insert("data_tbl", table1)

    hive_names = set(table1.get('Data_Tbl_Phys_Nm') for table1 in tables1)
    for table2 in tables2:
        name = table2.get('Data_Tbl_Phys_Nm')
        if name in hive_names:
            continue
        # Log the table being flagged.  The previous code logged the inner
        # loop variable, which named the wrong table and was unbound (raising
        # NameError) whenever Hive returned no tables at all.
        logging.debug('table2:%s', name)
        try:
            # NOTE(review): the statement is assembled with str.format; a
            # quote in the table name would break it.  Switch to bind
            # parameters if MySQL.execute supports them here.
            conn.execute(
                "update data_tbl set Del_Dt='{}' where Data_Tbl_Phys_Nm='{}'"
                .format(date, name))
        except Exception:
            logging.error('删除表元数据失败,数据为:' + str(table2))
            print(traceback.format_exc())
            sys.exit(1)
    del conn
def _partition_key(partition):
    """Return the (table id, path) pair that identifies a partition row."""
    return partition.get('Data_Tblid'), partition.get('Dp_Path')


def insertNewP(db, date):
    """Synchronise the ``dp`` table with the partitions reported by Hive.

    Partitions from ``gethiveP(db)`` that are not among ``getCP(db)`` are
    inserted into ``dp``; partitions from ``getCP(db)`` that Hive no longer
    reports are deleted.  Two rows are the same partition when both
    ``Data_Tblid`` and ``Dp_Path`` are equal.

    Args:
        db: Database name handed to ``gethiveP`` and ``getCP``.
        date: Unused; kept so existing callers keep working.

    The process exits with status 1 on the first failed insert or delete.
    """
    partitions1 = gethiveP(db)  # partitions Hive reports
    partitions2 = getCP(db)  # partitions already recorded
    conn = MySQL(config.washmeta)
    partitions1 = getTableID(partitions1)

    if len(partitions2) == 0:
        # First load: everything Hive reports is new.
        for p1 in partitions1:
            logging.debug('partition:' + str(p1['Data_Tblid']) + ':' +
                          p1['Dp_Path'])
            try:
                conn.insert("dp", p1)
            except Exception:
                logging.error('第一次插入分区数据失败,数据为:' + str(p1))
                print(traceback.format_exc())
                sys.exit(1)
    else:
        # Set lookups replace the former nested scans over both lists.
        recorded_keys = set(_partition_key(p2) for p2 in partitions2)
        for p1 in partitions1:
            if _partition_key(p1) in recorded_keys:
                continue
            logging.debug('插入分区partition:' + str(p1['Data_Tblid']) + ':' +
                          p1['Dp_Path'])
            try:
                conn.insert("dp", p1)
            except Exception:
                logging.error('插入分区数据失败,数据为:' + str(p1))
                print(traceback.format_exc())
                sys.exit(1)

        hive_keys = set(_partition_key(p1) for p1 in partitions1)
        for p2 in partitions2:
            if _partition_key(p2) in hive_keys:
                continue
            logging.debug('删除分区partition:' + str(p2['Data_Tblid']) + ':' +
                          p2['Dp_Path'])
            try:
                # NOTE(review): SQL assembled with str.format; a quote in
                # Dp_Path would break the statement.
                conn.execute(
                    "delete from dp where Data_Tblid='{}' and Dp_Path='{}'"
                    .format(p2.get('Data_Tblid'), p2.get('Dp_Path')))
            except Exception:
                # Report the row that failed to delete (p2).  The old code
                # printed p1, which is a different row and is unbound when
                # Hive reports no partitions.
                logging.error('删除分区数据失败,数据为:' + str(p2))
                print(traceback.format_exc())
                sys.exit(1)
    del conn
def get_comment(table_id):
    """Fetch the table comment and the field comments for ``table_id``.

    Each comment is converted with ``str``, stripped of leading/trailing
    newlines and then carriage returns, and has every ``;`` removed.  A
    missing (falsy) comment becomes an empty string.

    Returns:
        A ``(table_comment, field_comments)`` tuple where ``field_comments``
        is a list with one entry per row of ``const.FIELD_COMMENT``.
    """

    def _clean(raw):
        # Same normalisation for table and field comments.
        return str(raw or '').strip('\n').strip('\r').replace(';', '')

    db = MySQL(config.dqc_mysql)

    table_rows = db.execute(const.TABLE_COMMENT, (table_id, ))
    table_comment = _clean(table_rows[0].get("Data_Tbl_Cn_Nm"))

    field_rows = db.execute(const.FIELD_COMMENT, (table_id, ))
    field_comment = []
    for row in field_rows:
        field_comment.append(_clean(row.get("Fld_Cn_Nm")))

    del db
    return table_comment, field_comment
def insertCByT(db, tb):
    """Copy the Hive column metadata of one table into ``data_fld``.

    Args:
        db: Hive database name used to filter ``dbs.name``.
        tb: Table row; ``Data_Tbl_Phys_Nm`` selects the Hive table and
            ``Create_Dt`` is copied onto every inserted column.

    A failed insert is logged and printed, then the loop moves on to the
    next column.
    """
    hive_conn = MySQL(config.hivemeta)
    wash_conn = MySQL(config.washmeta)
    hivesql = (
        "select '{}' AS Data_Tblid, t1.COLUMN_NAME as Fld_Phys_Nm, "
        "t1.COMMENT as Fld_Cn_Nm, t1.TYPE_NAME as Fld_Data_Type, "
        "t1.INTEGER_IDX as Fld_Ord from columns_v2 t1 "
        "left join sds t2 on t1.cd_id = t2.cd_id "
        "left join tbls t3 on t2.sd_id = t3.sd_id "
        "left join dbs t4 on t3.db_id=t4.db_id "
        "where t4.name='{}' and t3.tbl_name='{}'"
    )
    table_name = tb.get('Data_Tbl_Phys_Nm')
    query = hivesql.format(table_name, db, tb.get('Data_Tbl_Phys_Nm'))
    columns = hive_conn.execute(query)
    # The first select column carries the table name; getTableID swaps it
    # for the id stored in data_tbl.
    columns = getTableID(columns)

    for column in columns:
        column['Create_Dt'] = tb['Create_Dt']
        try:
            logging.debug("插入新增表字段:" + str(column['Fld_Phys_Nm']))
            wash_conn.insert('data_fld', column)
        except Exception:
            logging.error("插入新增表字段失败:" + str(column['Fld_Phys_Nm']))
            print(traceback.format_exc())

    del hive_conn
    del wash_conn
def sanhuangua_join_profile(data, ds):
    """Join suspect scores with role profiles and return one row per role.

    The rows in ``data`` replace the contents of ``nsh_sanhuangua_tmp``; a
    join query against the profile table for partition ``ds`` is then posted
    to ``SHUYUAN_URL``.

    Args:
        data: Rows for ``batch_insert`` with columns ``role_id`` and
            ``suspect_score``.
        ds: Value substituted into the ``b.ds`` filter of the query.

    Returns:
        A list of lists: ``[role_id] + [value for each PROFILE_COLS entry]``.
        If a role id occurs more than once in the response, the last
        occurrence wins.
    """
    mysql = MySQL()

    # Empty the staging table, then load the new batch.
    mysql.execute('delete from nsh_sanhuangua_tmp')
    mysql.batch_insert('nsh_sanhuangua_tmp', ['role_id', 'suspect_score'],
                       data)

    query_template = (
        " select a.suspect_score, b.* "
        "from anti_plugin.nsh_sanhuangua_tmp a "
        "join luoge_nsh_mid.mid_role_portrait_all_d b "
        "on a.role_id = b.role_id "
        "where b.ds = '{ds}' "
    )
    sql = query_template.format(ds=ds)
    logging.info(sql)

    # Post the join query and read the rows from the JSON response.
    payload = {'sql': sql, 'needReturn': 'true'}
    response = requests.post(SHUYUAN_URL, timeout=6000, json=payload)

    profiles_by_role = {}
    for record in response.json()['data']:
        flat = {}
        for column, value in record.items():
            # Keep only the part after the last '.', and map None to '0'.
            flat[column.split('.')[-1]] = '0' if value is None else value
        ordered_values = [flat.get(col, '') for col in PROFILE_COLS]
        profiles_by_role[flat['role_id']] = ordered_values

    return [[role_id] + values
            for role_id, values in profiles_by_role.items()]
def get_target_database(database, usage):
    """Return ``db_phys_nm`` from the first row of ``const.TARGET_DATABASE``.

    Args:
        database: First bind parameter; also echoed in the error message.
        usage: Second bind parameter.

    Raises:
        exception.DQCException: If the query returns no rows.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.TARGET_DATABASE, (database, usage))
    del connection

    if len(rows) != 0:
        return rows[0]["db_phys_nm"]

    message = "target database is non-exist. [database:%s]" % database
    raise exception.DQCException(message)
def get_change_ddl(table_id):
    """Return True when ``mask_const.TABLE_CHANGE`` yields rows for the table.

    Args:
        table_id: Bind parameter of the query.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(mask_const.TABLE_CHANGE, (table_id, ))
    has_change = len(rows) != 0
    del connection
    return has_change
def compareC(db, table1, table2):
    """Tell whether the column rows of a table differ between Hive and wash.

    Args:
        db: Database name formatted into both queries.
        table1: Hive-side table row (``Data_Tbl_Phys_Nm`` is read).
        table2: Wash-side table row (``Data_Tbl_Phys_Nm`` is read).

    Returns:
        True if the two sorted result lists are not equal, False otherwise.
    """
    hive_conn = MySQL(config.hivemeta)
    wash_conn = MySQL(config.washmeta)

    hive_sql = const.getTCSql.format(db, table1.get('Data_Tbl_Phys_Nm'))
    hive_columns = hive_conn.execute(hive_sql)
    wash_sql = const.getCTCSql.format(db, table2.get('Data_Tbl_Phys_Nm'))
    wash_columns = wash_conn.execute(wash_sql)

    del hive_conn
    del wash_conn

    # Sort both sides so row order from the database does not matter.
    hive_columns.sort()
    wash_columns.sort()
    return hive_columns != wash_columns
def get_database_table(table_id):
    """Return ``(db_phys_nm, data_tbl_phys_nm)`` for ``table_id``.

    Both values come from the first row of ``const.DB_TABLE``.

    Raises:
        exception.DQCException: If the query returns no rows.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.DB_TABLE, (table_id, ))
    del connection

    if len(rows) != 0:
        return rows[0]["db_phys_nm"], rows[0]["data_tbl_phys_nm"]

    message = "database table is non-exist. [table_id:%s]" % table_id
    raise exception.DQCException(message)
def get_check_item():
    """Return a dict mapping ``Chk_Proj_Cd`` to ``Chk_Projid``.

    One entry is produced per row of ``const.CHECK_ITEM``; a repeated code
    keeps the id of its last row.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.CHECK_ITEM)
    del connection
    return {row['Chk_Proj_Cd']: row['Chk_Projid'] for row in rows}
def get_partition_path(table_id, partition_date):
    """Return ``dp_path`` of the first matching row, or None if none match.

    Args:
        table_id: First bind parameter of ``const.PARTITION_PATH``.
        partition_date: Second bind parameter.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.PARTITION_PATH,
                              (table_id, partition_date))
    del connection

    if len(rows) != 0:
        return rows[0]["dp_path"]
    return None
def get_label_id(table_id):
    """Return ``Labelid`` from the first row of ``mask_const.MASK_LABLE``.

    Raises:
        exception.MaskException: If the query returns no rows.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(mask_const.MASK_LABLE, (table_id, ))
    del connection

    if len(rows) != 0:
        return rows[0]["Labelid"]

    message = "table lable is non-exist. [table_id:%s]" % table_id
    raise exception.MaskException(message)
def get_field(table_id):
    """Return a dict mapping ``Fld_Phys_Nm`` to ``Fldid`` for ``table_id``.

    One entry is produced per row of ``const.FIELD_TABLE``; a repeated field
    name keeps the id of its last row.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.FIELD_TABLE, (table_id, ))
    del connection
    return {row['Fld_Phys_Nm']: row['Fldid'] for row in rows}
def get_partition_latest(table_id):
    """Return the ``latest`` value from ``const.MAX_PARTITION_DATE``.

    Raises:
        exception.DQCException: If ``latest`` in the first row is None.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.MAX_PARTITION_DATE, (table_id, ))
    del connection

    latest = rows[0]["latest"]
    if latest is not None:
        return latest

    message = "table partition is non-exist. [table_id:%s]" % table_id
    raise exception.DQCException(message)
def get_mask_cmpu():
    """Return a dict mapping ``Data_Wash_Cmpu_Cd`` to ``Data_Wash_Cmpuid``.

    One entry is produced per row of ``mask_const.MASK_CMPU``.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(mask_const.MASK_CMPU)
    del connection

    ids_by_code = {}
    for row in rows:
        code = row["Data_Wash_Cmpu_Cd"]
        ids_by_code[code] = row["Data_Wash_Cmpuid"]
    return ids_by_code
def getCC(db):
    """Run ``const.getCCSql`` for ``db`` on the wash metadata connection.

    Returns the rows from ``execute``.  Any exception while connecting or
    querying is logged, its traceback printed, and the process exits with
    status 1.
    """
    try:
        wash_conn = MySQL(config.washmeta)
        query = const.getCCSql.format(db)
        rows = wash_conn.execute(query)
    except Exception:
        logging.error('获取清洗库 字段数据 失败!')
        print(traceback.format_exc())
        sys.exit(1)
    del wash_conn
    return rows
def gethiveC(db):
    """Run ``const.getCSql`` for ``db`` on the Hive metadata connection.

    Returns the rows from ``execute``.  Any exception while connecting or
    querying is logged, its traceback printed, and the process exits with
    status 1.
    """
    try:
        hive_conn = MySQL(config.hivemeta)
        query = const.getCSql.format(db)
        rows = hive_conn.execute(query)
    except Exception:
        logging.error('获取hive 字段元数据失败')
        print(traceback.format_exc())
        sys.exit(1)
    del hive_conn
    return rows
def getCT(db):
    """Run ``const.getCTSql`` for ``db`` on the wash metadata connection.

    Returns the rows from ``execute``.  Any exception while connecting or
    querying is logged, its traceback printed, and the process exits with
    status 1.
    """
    try:
        wash_conn = MySQL(config.washmeta)
        query = const.getCTSql.format(db)
        rows = wash_conn.execute(query)
    except Exception:
        logging.error('获取清洗库 表元数据 失败!')
        print(traceback.format_exc())
        sys.exit(1)
    del wash_conn
    return rows
def gethiveP(db):
    """Run ``const.getPSql`` for ``db`` on the Hive metadata connection.

    Returns the rows from ``execute``.  Any exception while connecting or
    querying is logged, its traceback printed, and the process exits with
    status 1.
    """
    try:
        hive_conn = MySQL(config.hivemeta)
        query = const.getPSql.format(db)
        rows = hive_conn.execute(query)
    except Exception:
        logging.error('获取hive 分区元数据失败!')
        print(traceback.format_exc())
        sys.exit(1)
    del hive_conn
    return rows
def compareP(db, table1, table2):
    """Tell whether Hive has a partition row that the wash metadata lacks.

    Each Hive row is matched against the remaining wash rows; a matched wash
    row is removed so it cannot satisfy a second Hive row.

    Args:
        db: Database name formatted into both queries.
        table1: Hive-side table row (``Data_Tbl_Phys_Nm`` is read).
        table2: Wash-side table row (``Data_Tbl_Phys_Nm`` is read).

    Returns:
        True as soon as one Hive row has no equal wash row left, otherwise
        False.  Wash rows without a Hive counterpart do not affect the
        result.
    """
    hive_conn = MySQL(config.hivemeta)
    wash_conn = MySQL(config.washmeta)

    hive_sql = const.getTPSql.format(db, table1.get('Data_Tbl_Phys_Nm'))
    hive_parts = hive_conn.execute(hive_sql)
    wash_sql = const.getCTPSql.format(db, table2.get('Data_Tbl_Phys_Nm'))
    wash_parts = wash_conn.execute(wash_sql)

    del hive_conn
    del wash_conn

    for hive_part in hive_parts:
        for wash_part in wash_parts:
            if hive_part == wash_part:
                # Consume the match; safe because the loop stops right away.
                wash_parts.remove(wash_part)
                break
        else:
            # No wash row matched this Hive partition.
            return True
    return False
def _column_signature(column):
    """Return the tuple of attributes that decides whether a column changed."""
    return (
        column.get('Data_Tblid'),
        column.get('Fld_Phys_Nm'),
        column.get('Fld_Cn_Nm'),
        column.get('Fld_Data_Type'),
        column.get('Fld_Ord'),
    )


def updateC(db, date):
    """Synchronise ``data_fld`` with the column metadata reported by Hive.

    When ``getCC(db)`` returns nothing, every Hive column is inserted with
    ``Create_Dt`` set to ``date``.  Otherwise every Hive column whose
    (table id, name, comment, type, ordinal) tuple has no equal recorded row
    is written back with an UPDATE keyed on table id and field name, with
    ``Upd_Dt`` set to ``date``.

    Args:
        db: Database name handed to ``gethiveC`` and ``getCC``.
        date: Value stored in ``Create_Dt`` or ``Upd_Dt``.

    The process exits with status 1 on the first failed insert or update.
    """
    columns1 = gethiveC(db)  # columns Hive reports
    columns2 = getCC(db)  # columns already recorded
    conn = MySQL(config.washmeta)
    columns1 = getTableID(columns1)

    if len(columns2) == 0:
        for c1 in columns1:
            c1['Create_Dt'] = date
            logging.debug('column:' + str(c1['Data_Tblid']) + ':' +
                          c1['Fld_Phys_Nm'])
            try:
                conn.insert("data_fld", c1)
            except Exception:
                logging.error('第一次插入字段数据失败,失败数据为:' + str(c1))
                print(traceback.format_exc())
                sys.exit(1)
    else:
        # One set lookup per Hive column instead of scanning every recorded
        # column for each of them.
        recorded = set(_column_signature(c2) for c2 in columns2)
        for c1 in columns1:
            if _column_signature(c1) in recorded:
                continue
            c1['Upd_Dt'] = date
            logging.debug('column:' + str(c1['Data_Tblid']) + ':' +
                          c1['Fld_Phys_Nm'])
            try:
                # NOTE(review): values are spliced in with str.format, so a
                # quote inside a column comment breaks the statement; use
                # bind parameters if MySQL.execute supports them.
                # NOTE(review): a column that exists only in Hive matches no
                # row here, so the UPDATE changes nothing - confirm whether
                # an INSERT is intended for that case.
                conn.execute(
                    "update data_fld set Fld_Cn_Nm='{}',Fld_Data_Type='{}',Fld_Ord='{}',Upd_Dt='{}' where Data_Tblid='{}' and Fld_Phys_Nm='{}'"
                    .format(c1['Fld_Cn_Nm'], c1['Fld_Data_Type'],
                            c1['Fld_Ord'], c1['Upd_Dt'], c1['Data_Tblid'],
                            c1['Fld_Phys_Nm']))
            except Exception:
                logging.error('插入新增字段数据失败,数据为:' + str(c1))
                print(traceback.format_exc())
                sys.exit(1)
    del conn
def getDBs(tables):
    """Replace the database name held in each table's ``Dbid`` with its id.

    The rows of ``const.getDBs`` supply the ``Db_Phys_Nm`` to ``Dbid``
    pairs.  A table whose ``Dbid`` matches no ``Db_Phys_Nm`` is left
    untouched.

    Args:
        tables: List of table dicts; modified in place.

    Returns:
        The same ``tables`` list.

    The process exits with status 1 if the lookup query fails.
    """
    try:
        wash_conn = MySQL(config.washmeta)
        db_rows = wash_conn.execute(const.getDBs)
    except Exception:
        logging.error('获取库ID 失败!')
        print(traceback.format_exc())
        sys.exit(1)

    for table_row in tables:
        for db_row in db_rows:
            if table_row.get('Dbid') != db_row.get('Db_Phys_Nm'):
                continue
            table_row['Dbid'] = db_row['Dbid']
    return tables
def getTableID(tables1):
    """Replace the table name held in each row's ``Data_Tblid`` with its id.

    The ids come from ``data_tbl``.  A row whose ``Data_Tblid`` matches no
    ``Data_Tbl_Phys_Nm`` is left untouched.

    Args:
        tables1: List of dicts (columns, partitions, ...) whose
            ``Data_Tblid`` currently holds a table name; modified in place.

    Returns:
        The same ``tables1`` list.

    The process exits with status 1 if the lookup query fails.
    """
    try:
        conn = MySQL(config.washmeta)
        tables2 = conn.execute(
            'select Data_Tblid,Data_Tbl_Phys_Nm from data_tbl')
    except Exception:
        logging.error('获取表ID 失败!')
        print(traceback.format_exc())
        sys.exit(1)

    # Build the name -> id map once instead of scanning data_tbl for every
    # input row.  setdefault keeps the first row per name, which is the row
    # the former nested scan matched first.
    ids_by_name = {}
    for t2 in tables2:
        ids_by_name.setdefault(t2.get('Data_Tbl_Phys_Nm'),
                               t2.get('Data_Tblid'))

    for t1 in tables1:
        name = t1.get('Data_Tblid')
        if name in ids_by_name:
            # A single lookup also stops the freshly substituted id from
            # being compared against the remaining table names.
            t1['Data_Tblid'] = ids_by_name[name]
    return tables1
def updateT(db, date):
    """Insert new tables or stamp ``Upd_Dt`` on tables that changed.

    When ``getCT(db)`` returns nothing, every Hive table is inserted into
    ``data_tbl``.  Otherwise each Hive table is paired with the recorded
    table of the same ``Data_Tbl_Phys_Nm``; if ``compareP`` or ``compareC``
    reports a difference, ``Upd_Dt`` is set to ``date``.

    Args:
        db: Database name handed to the metadata helpers.
        date: Value stored in ``Create_Dt`` or ``Upd_Dt``.

    The process exits with status 1 on the first failed insert or update.
    """
    hive_tables = gethiveT(db)  # tables Hive reports
    wash_tables = getCT(db)  # tables already recorded
    conn = MySQL(config.washmeta)
    hive_tables = getDBs(hive_tables)

    if len(wash_tables) == 0:
        for hive_table in hive_tables:
            hive_table['Create_Dt'] = date
            hive_table['Data_Tbl_UUID'] = uuid.uuid1()
            logging.debug('table1:' + hive_table['Data_Tbl_Phys_Nm'])
            try:
                conn.insert("data_tbl", hive_table)
            except Exception:
                logging.error('第一次插入表数据失败,插入数据是:' + str(hive_table))
                print(traceback.format_exc())
                sys.exit(1)

    for hive_table in hive_tables:
        for wash_table in wash_tables:
            same_name = (hive_table.get('Data_Tbl_Phys_Nm') ==
                         wash_table.get('Data_Tbl_Phys_Nm'))
            if not same_name:
                continue
            # compareC is only consulted when the partitions look unchanged.
            changed = (compareP(db, hive_table, wash_table)
                       or compareC(db, hive_table, wash_table))
            if not changed:
                continue
            logging.debug('对比表table1:' + hive_table['Data_Tbl_Phys_Nm'])
            try:
                conn.execute(
                    "update data_tbl set Upd_Dt='{}' where Data_Tbl_Phys_Nm='{}'"
                    .format(date, hive_table.get('Data_Tbl_Phys_Nm')))
            except Exception:
                logging.error('更新表元数据失败,数据为:' + str(hive_table))
                print(traceback.format_exc())
                sys.exit(1)
    del conn
def mask_job_type(jobid):
    """Return the job type of a masking job.

    Job types (Job_Type), as listed in the original note:
        2  run data masking immediately
        3  run data masking periodically

    Raises:
        exception.MaskException: If no row exists for ``jobid``, or the
            stored type is neither ``mask_const.MASK_IMM_MODE`` nor
            ``mask_const.MASK_FREQ_MODE``.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(mask_const.JOB_TYPE, (jobid, ))
    del connection

    if len(rows) == 0:
        raise exception.MaskException(
            "jobid is non-exist. [jobid:%s]" % jobid)

    job_type = rows[0]["job_type"]
    is_immediate = job_type == mask_const.MASK_IMM_MODE
    if not is_immediate and not job_type == mask_const.MASK_FREQ_MODE:
        raise exception.MaskException(
            "job type is invalid. [jobid:%s]" % jobid)
    return job_type
def get_metadata_field(table_id):
    """Collect field metadata for ``table_id`` from ``const.TABLE_FIELD``.

    Returns:
        A 4-tuple ``(field, datatype, pk, null)``:
        field    - every ``fld_phys_nm`` in row order;
        datatype - every ``fld_data_type`` in row order;
        pk       - names of rows where ``if_pk`` equals 1;
        null     - names of rows where ``if_can_null`` equals 0.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(const.TABLE_FIELD, (table_id, ))
    del connection

    field, datatype, pk, null = [], [], [], []
    for row in rows:
        name = row["fld_phys_nm"]
        field.append(name)
        datatype.append(row["fld_data_type"])
        if row["if_pk"] == 1:
            pk.append(name)
        if row["if_can_null"] == 0:
            null.append(name)
    return field, datatype, pk, null
def get_field_check(table_id):
    """Return the check rules configured for each field of ``table_id``.

    Result structure::

        {
            "field1": [rule1, rule2, ...],
            "field2": [rule1],
            ...
        }

    Fields appear in the order they are first seen in the rows of
    ``const.FIELD_CHECK``; each rule code (``chk_proj_cd``) is listed once
    per field, in first-seen order.

    Returns:
        A ``collections.OrderedDict`` mapping ``fld_phys_nm`` to a list of
        rule codes.
    """
    db = MySQL(config.dqc_mysql)
    result = db.execute(const.FIELD_CHECK, (table_id, ))
    del db

    check = collections.OrderedDict()
    for rs in result:
        # Each field owns its list.  The earlier shared scratch list was
        # only reset when a field was first seen, so rows of one field
        # separated by rows of another picked up the other field's rules.
        rules = check.setdefault(rs["fld_phys_nm"], [])
        code = rs["chk_proj_cd"]
        if code not in rules:
            rules.append(code)
    return check
def access():
    """Run ``ACCESS_SQL`` on the ``config.dqc_mysql`` connection.

    Returns:
        Whatever ``execute`` returns for that statement.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(ACCESS_SQL)
    del connection
    return rows
class Tables:
    """Create or drop tables, and insert into or delete data from them.

    The SQL statements are read from ``sqls.xml``.  They are grouped under
    elements with the ids ``createSql``, ``dropSql``, ``cleanSql`` and
    ``insertSql``; the individual statements are looked up by ``id``.
    """

    def __init__(self):
        """Load ``sqls.xml`` and open the database connection.

        Raises:
            IOError: If ``sqls.xml`` cannot be opened (logged first).
        """
        self._logger = Logger(__file__)
        try:
            fsock = open("sqls.xml", "r")
        except IOError:
            self._logger.error("The file don't exist, Please double check!")
            # Without the file there is nothing to parse; previously the
            # code carried on and failed with a NameError on ``fsock``.
            raise
        with fsock:
            markup = fsock.read()
        self.sqls = BeautifulSoup(markup)
        # NOTE(review): credentials are hard-coded here; consider moving
        # them to configuration.
        dbconfig = {'host':'127.0.0.1',
                    'port': 3306,
                    'user':'******',
                    'passwd':'123456',
                    'db':'scenic',
                    'charset':'utf8'}
        self.db = MySQL(dbconfig)

    def _find_sql(self, group, name):
        """Return the SQL text stored under ``group``/``name``, or None."""
        section = self.sqls.find(id=group)
        if section is None:
            return None
        node = section.find(id=name)
        if node is None:
            return None
        return node.string

    def initDB(self):
        """Create all tables listed under ``createSql``."""
        createSqls = self.sqls.find(id="createSql")
        for item in createSqls.select("item"):
            sql = item.string
            self._logger.info("create the table "+item.attrs["id"])
            self.db.execute(sql)
            # Per the original author's note: the cursor must be reopened
            # after each statement, otherwise an exception with error code
            # 1024 is raised.
            self.db.reopenCursor()

    def createTable(self,name):
        """Create the table whose create statement has id ``name``.

        An error is logged when no such statement exists.
        """
        create = self._find_sql("createSql", name)
        if create:
            self._logger.info(" create table "+name)
            self.db.execute(create)
        else:
            self._logger.error("error occured when create table "+name)

    def dropAll(self):
        """Drop all tables listed under ``dropSql``."""
        dropSqls= self.sqls.find(id="dropSql")
        for item in dropSqls.select("item"):
            sql = item.string
            self._logger.info("drop the table "+item.attrs["id"])
            self.db.execute(sql)

    def dropTable(self,name):
        """Drop the table whose drop statement has id ``name``.

        A warning is logged when no such statement exists.
        """
        # Look the statement up by id, like createTable/insertTable do, and
        # execute its text; the old code executed an undefined ``sql``.
        drop = self._find_sql("dropSql", name)
        if drop:
            self._logger.info("drop the table "+name)
            self.db.execute(drop)
        else:
            self._logger.warn("Don't have the table "+name)

    def cleanAll(self):
        """Delete the data of all tables listed under ``cleanSql``.

        The tables themselves are not dropped.
        """
        cleanSqls= self.sqls.find(id="cleanSql")
        for item in cleanSqls.select("item"):
            sql = item.string
            self._logger.info("clean the table "+item.attrs["id"])
            self.db.execute(sql)

    def cleanTable(self,name):
        """Clean the data of the specified table (not implemented)."""
        pass

    def insertTable(self,name,params):
        """Insert values into the specified table.

        # Parameters:
            name: the name of the table
            params: the values to insert.  It can be a tuple for inserting
                one row, or a list for inserting several rows.
        # Return:
            None.  An error is logged when no insert statement with id
            ``name`` exists.
        """
        insert = self._find_sql("insertSql", name)
        if insert:
            self._logger.info(" insert into table "+name)
            self.db.insert(insert,params)
        else:
            self._logger.error("did not find the table "+name+" when insert")

    def insertData(self,data):
        """Insert one ``Scenic`` object; the interface for outside callers.

        # Parameters:
            data: a ``Scenic`` instance.  Its attributes fill one row of the
                ``scenery`` table and one ``sceneryImages`` row per image.
        # Return:
            False when ``data`` is not a ``Scenic`` instance, otherwise None.
        """
        if not isinstance(data,Scenic):
            self._logger.error("the parameter is not the instance of Scenic")
            return False

        data.encode()
        types = self.joint(data.types)
        seasons = self.joint(data.fits)
        sceneryParams = (data.id,data.name,data.province,data.city,data.area,
                         data.level,data.quality,data.description,
                         data.website,data.symbol,data.opentime,
                         data.closetime,data.price,data.suggest,seasons,
                         types,data.longitude,data.latitude,data.precise,
                         data.confidence)
        imageParams = []
        for item in data.images:
            imageParams.append(
                (data.id,str(uuid.uuid1()),item,data.name,data.name))
        self.insertTable("scenery",sceneryParams)
        # Only insert images when there are any; an empty list would fail.
        if imageParams:
            self.insertTable("sceneryImages",imageParams)

    def joint(self,data,split=","):
        """Join the items of a list with ``split`` (default ``,``).

        Returns an empty string when ``data`` is not a list or is empty.
        """
        if isinstance(data,list):
            return split.join(data)
        return ""

    def initTables(self):
        """Fill the basic tables ``sceneryType`` and ``season``."""
        basic = SearchParams()
        # insert basic data into sceneryType table
        params = []
        for item in basic.scenicType.keys():
            params.append((basic.scenicType[item],item,item))
        self.insertTable("sceneryType",params)
        # insert basic data into season table
        params = []
        for item in basic.scenicFit.keys():
            params.append((basic.scenicFit[item],item))
        self.insertTable("season",params)
def share():
    """Run ``SHARE_SQL`` on the ``config.dqc_mysql`` connection.

    Returns:
        Whatever ``execute`` returns for that statement.
    """
    connection = MySQL(config.dqc_mysql)
    rows = connection.execute(SHARE_SQL)
    del connection
    return rows