def apendDataFrame(self, apenddf, resultfile, resultsheet, header): """ 向目标文件追加数据 :param apenddf: 追加的 dataframe :param resultfile: 目标文件路径 :param resultsheet: 目标文件的 sheet 名 :param header: 是否需要保留 header TODO """ # 写入的文件dataframe,engine:以操作工具包执行、mode必须为读状态否者会新增sheet writer = pd.ExcelWriter(resultfile, engine='openpyxl') book = load_workbook(resultfile) writer.book = book # 创建 sheets writer.sheets = dict((ws.title, ws) for ws in book.worksheets) # 获取追加行数 try: dfNum = pd.DataFrame(pd.read_excel(resultfile, sheet_name=resultsheet)) except Exception as e: print(str(e)) JSExcelHandler.errorlog("追加合并数据-{}".format(resultfile)) # TODO 这里要进行列的判断 newRowsNum = dfNum.shape[0] + 1 # 写入文件 apenddf.to_excel(excel_writer=writer, sheet_name=resultsheet, index=False, header=header, startrow=newRowsNum) writer.save()
def lowerDimensionOfTitle(self, path, startrow=0): """ 进行降范式操作 :param path: 目标文件的路径 :param startrow: 开始降范式的起始行(因为会有标题的情况,不属于表头范围,需要去掉) :return: 最底层子树的标题, 追加数据的起始行, keywords """ readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) rSheet = readOpenXlsx.sheet_by_name(sheetnames[0]) valuelist = [] # 表头层级最下面的的标题 keywords= [] lastrow = 0 # 表格的列边界 ecol = rSheet.ncols resultCells = rSheet.merged_cells # mergeCellCount = len(mergedCells) mergedCells = [] # 这里要加从第几行开始降范式,默认是从 0 行开始, 默认是从 0 行开始 if startrow > 0: for index, cell in enumerate(resultCells): crlow, crhign, cclow, cchigh = cell if crlow >= startrow: mergedCells.append(cell) else: mergedCells = resultCells mergeCellCount = len(mergedCells) # 没有 mergeCell 的情况, 默认第一行为表头 if mergeCellCount == 0: for colindex in range(0, ecol): value = rSheet.cell_value(startrow, colindex) valuelist.append(value) keywords.append(value) # print("判断--" + rSheet.name + "--的合并单元格已结束") # 参数返回:范式标识(False为一范式,True为多范式)、表头值、数据行坐标 print(valuelist) lastrow = 1 + startrow return valuelist, lastrow, keywords else: levelmap = self.createLevelMap(mergedCells, rSheet) maxlevel = max(levelmap.keys()) dummy = self.createLevelTree(levelmap, rSheet) # 根据层级来生成最终的 valuelist, keywords self.travel(dummy, valuelist, keywords, maxlevel) # print(keywords) # 这里应该比较所有叶子节点的下边界 lowestnodes = levelmap[maxlevel] # print(lowestnodes) headrow = 0 for node in lowestnodes: if len(node.child) > 0: headrow = maxlevel + 2 # break rlow, rhign, clow, chigh = node.cellrange if (rhign > headrow): headrow = rhign lastrow = headrow # lastrow = max(levelmap.keys()) + 1 return valuelist, lastrow, keywords
def idCardVerification(row): if JSExcelHandler.checkIsNull(row): if not JSExcelHandler.checkIDNumber(row): return 'false' else: return '' else: return 'false'
def autoExtractSheetHead(self, configuremodel): storepath = configuremodel.storepath # 所有需要合并的表格数据 exceldatalist = JSExcelHandler.getPathFromRootFolder(storepath) excelhandler = JSExcelController() excelhandler.getPathFromRootFolder() print("开始搜索表头")
def restoreHead(self, oldfilepath, sheetname, newfile, title, startrow=0): """ 因为根据模板表头进行匹配,所以将表头与数据部分分开来,先生成表头部分,再将数据部分进行拼接一次性写入,减少 IO操作 :param oldfilepath: :param sheetname: :param newfile: :param title: :param startrow: :return: """ readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(oldfilepath) writer = pd.ExcelWriter(newfile, engine='xlsxwriter') workbook = writer.book merge_format = workbook.add_format({'align': 'center'}) title.to_excel(writer, sheet_name=sheetnames[0], index=False, header=False) rsheet = readOpenXlsx.sheet_by_name(sheetname) resultCells = rsheet.merged_cells mergedCells = [] # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况 if startrow > 0: for index, cell in enumerate(resultCells): crlow, crhign, cclow, cchigh = cell if crlow < startrow: mergedCells.append(cell) else: mergedCells = resultCells for cellrange in mergedCells: crlow, crhign, cclow, cchigh = cellrange if crlow < startrow: worksheet = writer.sheets[sheetnames[0]] value = rsheet.cell_value(crlow, cclow) """ Merge a range of cells. Args: first_row: The first row of the cell range. (zero indexed). first_col: The first column of the cell range. last_row: The last row of the cell range. (zero indexed). last_col: The last column of the cell range. data: Cell data. cell_format: Cell Format object. """ # print("{}-{}-{}-{}-{}".format(crlow, cclow, crhign - 1, cchigh - 1, value)) worksheet.merge_range(crlow, cclow, crhign - 1, cchigh - 1, value, merge_format) writer.save() return writer, sheetnames
def __init__(self, master, controller): my_path = os.path.dirname(os.path.realpath(sys.argv[0])) my_path = os.path.join(my_path, 'ETLFile') self.result = os.path.join(my_path, 'result') self.delegate = controller self.master = master # self.master.config(bg='blue') self.face = tkinter.Frame(self.master, ) self.face.pack() # 主窗口 # 创建根据标准表头进行合并数据的按钮 self.ETLButton = tkinter.Button(self.face, command=self.ETLButtonHandler, text="执行校验") self.ETLButton.grid(row=5, column=2) self.datatext = tkinter.Label(self.face, text='校验文件路径', bd=4) self.datatext.grid(row=0, column=1) # self.datalabel = tkinter.Entry(self.face, width=80) # self.datalabel.grid(row=0, column=2) # self.datalabel.insert(END, result) self.headstext = tkinter.Label(self.face, text='身份证校验字段', bd=4) self.headstext.grid(row=1, column=1) self.headslabel = tkinter.Entry(self.face, width=80) self.headslabel.grid(row=1, column=2) self.headslabel.insert(END, '身份证') self.validrangetext = tkinter.Label(self.face, text='空值校验字段', bd=4) self.validrangetext.grid(row=3, column=1) self.validrangelabel = tkinter.Entry(self.face, width=80) self.validrangelabel.grid(row=3, column=2) self.validrangelabel.insert(END, '单位,金额') self.datetext = tkinter.Label(self.face, text='统一时间列', bd=4) self.datetext.grid(row=4, column=1) self.datelabel = tkinter.Entry(self.face, width=80) self.datelabel.grid(row=4, column=2) self.datelabel.insert(END, '入职时间') self.btn_back = tkinter.Button(self.face, text='返回批处理界面', command=self.back) self.btn_back.grid(row=6, column=2) self.comboxlist = ttk.Combobox(self.face, width= 75, postcommand = self.pathCallback) # 初始化 self.comboxlist["values"] = JSExcelHandler.getPathFromRootFolder(self.result) # self.comboxlist.current(0) # 选择第一个 # self.comboxlist.bind("<<ComboboxSelect>>",callbackFunc) # 绑定事件,(下拉列表框被选中时,绑定go()函数) self.comboxlist.grid(row=0, column=2)
def ReadDataThenCompareAndExtract(self, configuremodel): # 存放表头的路径 datapath = configuremodel.datapath # 读取所有数据文件 datafilelist = JSExcelHandler.getPathFromRootFolder(datapath) # 类型是 map headsmaplist = self.readStandardHeadFromFolder(configuremodel) for path in datafilelist: for df in headsmaplist.values(): # 获取模板表头的行数,用于数据表中获取表头范围 totalrows = len(df.index) # 根据标准表头获取数据里的表头进行比对 datadf = pd.read_excel(path, nrows=totalrows) if datadf.equals(df): print('is the same')
def readStandardHeadFromFolder(self, configuremodel): headspath = configuremodel.headspath excellist = JSExcelHandler.getPathFromRootFolder(headspath) dfmaplist = {} # 从标准头路径中读取标准表头的格式,为比对做准备 for index, excelpath in enumerate(excellist): newdf = pd.read_excel(str(excelpath)) # '/Users/sun/Desktop/heads/head3.xlsx' if len(dfmaplist) > 0: for olddf in dfmaplist.values(): if newdf.equals(olddf) is True: break else: # 以路径来作为唯一标识,因为路径是唯一的 dfmaplist[excelpath] = newdf return dfmaplist
def __readFile(self, filepath): lines = JSExcelHandler.readtxt(filepath) varmap = {} for index, line in enumerate(lines): if index == 0: # 抽取文件的路径 varmap['datapath'] = line.strip() elif index == 1: # 读取 heads 的路径 varmap['headspath'] = line.strip() elif index == 2: # 最终合表结果存放路径 varmap['storepath'] = line.strip() elif index == 3: # 进行比对的起始行 varmap['validrange'] = self.transformStringToMap(line.strip()) elif index == 4: # 进行比对的起始行 varmap['reserveHead'] = line.strip() else: varmap['keywords'] = line.strip().split(',') return varmap
def newfilesave(self, oldfile, newfile, valuelist, startrow=0, headers=False): """ :param oldfile: :param newfile: :param valuelist: :param startrow: :param headers: :return: """ readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(oldfile) # 如果有 startrow 则表明有标题,暂时没有生成标题 if startrow > 0: headers = True if headers: title = pd.read_excel(oldfile, header=None, nrows=startrow) xlwter, sheetnames = self.restoreHead(oldfile, sheetnames[0], newfile, title, startrow) df = pd.DataFrame(columns=valuelist) df.to_excel(newfile, sheet_name=sheetnames[0], index=False) # self.apendDataFrame(df, newfile, sheetnames[0], headers) else: df = pd.DataFrame(columns=valuelist) df.to_excel(newfile, sheet_name=sheetnames[0], index=False) return newfile, sheetnames[0]
def readStandardHeadFromFolder(self, configuremodel): """ 从标准表头存放路径遍历出所有的模板,生成 文件名:模板 dataframe 的 map :param configuremodel: view 层的 model :return: 标准表头文件名 : 标准表头 dataframe """ headspath = configuremodel.headspath headlist = [] headlist = JSExcelHandler.getPathFromRootFolder(headspath, headlist) dfmaplist = {} dflist = list(dfmaplist.keys()) # 从标准头路径中读取标准表头的格式,为比对做准备 for excelpath in headlist: filename = os.path.split(excelpath)[-1] startrow = 0 if filename in configuremodel.validrange: startrow = int(configuremodel.validrange[filename]) # try: # path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None # titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None) newdf = pd.read_excel(str(excelpath), skiprows=startrow, header=None) # newdf.fillna(method='pad', inplace=True) # except Exception as e: # JSExcelHandler.errorlog("获取标准表头-{}".format(excelpath)) # pass if len(dflist) > 0: for olddf in dflist: if newdf.equals(olddf) == True: break else: dfmaplist[excelpath] = newdf dflist = list(dfmaplist.keys()) else: dfmaplist[excelpath] = newdf dflist = list(dfmaplist.values()) return dfmaplist
dfmaplist[excelpath] = newdf return dfmaplist if __name__ == '__main__': # # 配置文件路径 # path = os.path.abspath('..') + "/configureFile.txt" # 表示当前所处的文件夹上一级文件夹的绝对路径 # controller = JSETLController() # # View = JSConfigureView(path) # # view 层读取配置文件 # View.readConfigureFile() # # # Controller 调度执行读取模板表头文件 | 根据数据文件自动识别表头 # controller = JSETLController() # # 根据标准表头进行比对和抽取合并数据 # controller.ReadDataThenCompareAndExtract(View.configuremodel) path = '/Users/sun/Desktop/test/' list = JSExcelHandler.getPathFromRootFolder(path) controller = JSETLController() for path in list: readOpenXls, sheetnames, workpath = JSExcelHandler.OpenXls(path) for sheetname in sheetnames: rSheet = readOpenXls.sheet_by_name(sheetname) controller.findMinMergeCellRange(rSheet)
def nullVefify(row): if not JSExcelHandler.checkIsNull(row): return 'false' else: return ''
def pathCallback(self): datalist = JSExcelHandler.getPathFromRootFolder(self.result) self.comboxlist["values"] = datalist
df[temp] = df[word].apply(idCardVerification) for word in nulllist: temp = word + '空值校验' word = word.strip() if word in df.columns: df[temp] = df[word].apply(nullVefify) for word in datelist: if len(datelist) == 0: break # temp = word + '时间统一' word = word.strip() if word in df.columns: df[word] = df[word].apply(dateformated) # print(df) filename = os.path.split(sourthpath)[-1] dir = os.path.split(sourthpath)[0] newobjectpath = dir + '/' + filename.split('.', 1)[0] + '校验结果.xlsx' df.to_excel(newobjectpath) messagebox.showinfo("message", "校验完成\n输出路径为:%s" % (newobjectpath)) if __name__ == '__main__': # GUI 测试 # controller = JSVerifyController() # tkinter.mainloop() print(JSExcelHandler.checkIDNumber('51052119810109281X')) print(JSExcelHandler.checkIDNumber('61052419911020004X'))
def ReadDataThenCompareAndExtract(self, configuremodel): print("开始抽取合并数据 %s" % datetime.now()) # 存放表头的路径 datapath = configuremodel.datapath # 读取所有数据文件 datafilelist = [] datafilelist = JSExcelHandler.getPathFromRootFolder(datapath, datafilelist) if len(datafilelist) == 0: messagebox.showinfo("message", "未找到原始数据") else: # 存放合并结果的路径 resultfilepath = configuremodel.storepath # 类型是 map {路径 : df}, 这里是按照设置的起始行读取标准模板表头,默认是 0 headsmaplist = self.readStandardHeadFromFolder(configuremodel) if headsmaplist == {}: messagebox.showinfo("message", "未找到模板表头") else: for dfpath in headsmaplist.keys(): sum = [] # 拼接结果文件路径, 用 os.path win 和 linux 会有自己判断 filename = os.path.split(dfpath)[-1] startrow = 0 rangemap = configuremodel.validrange # 配置模板的起始降范式位置 if rangemap != {}: if filename in configuremodel.validrange: startrow = int(configuremodel.validrange[filename]) newfile = os.path.join(resultfilepath, filename) ''' 这里是为所有的标准模板预先建立数据抽取的结果文件 ''' try: # 降低表头的范式 valuelist, lastrows, keywords = self.lowerDimensionOfTitle(dfpath, startrow) # 创建文件及表头文件到result文件夹下为抽取该模板做准备 newfile, newsheetname = self.newfilesave(dfpath, newfile, valuelist, startrow) except Exception as e: JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath)) pass # 遍历目标数据文件下所有 sheet 是否与标准模板匹配,如果匹配则进行数据抽取合并操作 # 对文件进行分组合并 # def add(work, lock): # for path in work: # print(path) # try: # readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) # for sheetname in sheetnames: # # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比") # # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe # # 获取模板表头的行数,用于数据表中获取表头范围 # totalrows = headsmaplist[dfpath].shape[0] # # 根据标准表头获取数据里的表头进行比对 # titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, # header=None) # # 这里要进行最下层 node 的比对 # # if titledf.equals(headsmaplist[dfpath]): # # 抽取模块headsmaplist, 并对匹配上的表进行记录 # JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path)) # # 抽取数据 dafaframe # df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None, dtype='str') # filename = os.path.split(path)[-1] # df[r'来源文件'] = filename # lock.acquire() # # self.apendDataFrame(df, newfile, newsheetname, False) # sum.append(df) # lock.release() # # except Exception as e: # print(str(e)) # JSExcelHandler.errorlog(r"遍历目标数据文件下所有 sheet 与标准模板匹配后追加数据-{}".format(path)) # # threads = [] # lock = threading.Lock() # for i in range(0, len(datafilelist), 5): # work = datafilelist[i:i + 5] # # results.append(None) # enlarge the results list, so we have room for this thread's result # t = threading.Thread(target=add, args=(work, lock )) # t.start() # threads.append(t) # # for t in threads: # t.join() for path in datafilelist: print(path) try: readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) for sheetname in sheetnames: # print("校验模板表:" + str(dfpath) + "\n与数据表:" + path + '下工作表sheetname:' + sheetname + "对比") # 这里不能单纯的将标题范围下的数据一并写入,要重新建立按 keywords 排列顺序的 dataframe # 获取模板表头的行数,用于数据表中获取表头范围 totalrows = headsmaplist[dfpath].shape[0] # 根据标准表头获取数据里的表头进行比对 titledf = pd.read_excel(path, sheet_name=sheetname, nrows=totalrows, skiprows=startrow, header=None) # titledf.fillna(method='pad', inplace=True) # 这里要进行最下层 node 的比对 if titledf.equals(headsmaplist[dfpath]): # 抽取模块headsmaplist, 并对匹配上的表进行记录 JSExcelHandler.excuteCheck("{} ---- {}".format(dfpath, path)) # 抽取数据 dafafram df = pd.read_excel(path, sheetname, skiprows=lastrows, header=None) filename = os.path.split(path)[-1] df[r'来源文件'] = '''%s|%s''' % (filename, sheetname) sum.append(df) except Exception as e: JSExcelHandler.errorlog("降范式并且新建表头文件未抽取数据做准备-{}".format(dfpath)) pass res = pd.DataFrame([]) for tmpdf in sum: res = res.append(tmpdf) self.apendDataFrame(res, newfile, newsheetname, False) print("合并数据完成 %s" % datetime.now()) messagebox.showinfo("message", "合并完成\n输出路径为:%s" % (configuremodel.storepath))
def autoExtractSheetHead(self, configuremodel): """ 自动根据数据提取表头 :param configuremodel: 读取 view 层的 model """ print("开始抽取降范式的模板表头") datapath = configuremodel.datapath # 所有需要合并的表格数据 exceldatalist = JSExcelHandler.getPathFromRootFolder(datapath) dflist = [] for path in exceldatalist: try: readOpenXlsx, sheetnames, tempPath = JSExcelHandler().OpenXls(path) for sheetname in sheetnames: rSheet = readOpenXlsx.sheet_by_name(sheetname) # mergecells = rSheet.merged_cells resultCells = rSheet.merged_cells mergecells = [] # 添加扫描的行范围,默认为前 8 行进行表头的搜寻 rangerow = 8 # 根据原表中的 cellrange 确定合并单元格的范围, 筛选出 header 部分有合并单元格的情况 for index, cell in enumerate(resultCells): crlow, crhign, cclow, cchigh = cell if crlow < rangerow: mergecells.append(cell) # 默认第一行为表头 没有合并单元格的情况 if len(mergecells) == 0: headrow = 1 else: levelmap = self.createLevelMap(mergecells, rSheet) # print(path) dummy = self.createLevelTree(levelmap, rSheet) # 根据层级来生成最终的 valuelist, keywords # self.travel(dummy, valuelist, keywords, maxlevel) # print(keywords) maxlevel = max(levelmap.keys()) # 这里应该比较所有叶子节点的下边界 lowestnodes = levelmap[maxlevel] # print(lowestnodes) headrow = 0 for node in lowestnodes: if len(node.child) > 0: headrow = maxlevel + 2 # break rlow, rhign, clow, chigh = node.cellrange if (rhign > headrow): headrow = rhign # 找到每个 sheet 的表头范围进行和之前的对比 TODO 这里可以对提取出来的表头进行处理 curdf = pd.read_excel(path, sheetname, nrows=headrow, header=None) # 每个进行遍历对比,完全不同则添加到 list 中 flag = True for olddf in dflist: if curdf.equals(olddf): # JSExcelHandler.pathlog(path) flag = False pass # 第一次 if len(dflist) == 0: flag = True if flag: dflist.append(curdf) filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) # 这里要保存为 xlsx 为了兼容合并单元格的功能 objectpath = "{}/{}_{}.{}".format(configuremodel.headspath, filename, sheetname, 'xlsx') self.restoreHead(path, sheetname, objectpath, curdf, headrow) except Exception as e: print(str(e)) JSExcelHandler.errorlog("自动根据数据提取表头,原数据文件有问题-{}".format(path)) pass print("抽取模板表头完成")
#!/usr/bin/env python # coding=utf-8 # ======================================== # # Time: 3/26/21 12:31 PM # Author: Sun # Software: PyCharm # Description: # # # ======================================== from Tools.JSExcelHandler import JSExcelHandler import pandas as pd if __name__ == '__main__': list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\2021年处理数据源集合\未处理\2020年市级收支月报') reslist = [] for path in list: filename, suffix = JSExcelHandler.SplitPathReturnNameAndSuffix(path) df = pd.read_excel(path, sheet_name='YB01', skiprows=3) df = df.fillna('') sum = [] for index , row in df.iterrows(): str = row[1 list = [row[0], str, row[2], filename] sum.append(list) for index , row in df.iterrows(): str = row[4] list = [row[3], str, row[5], filename] sum.append(list) ttt = pd.DataFrame(sum, columns=['科目编码', '科目名称', '金额', '来源'])
#!/usr/bin/env python # coding=utf-8 # ======================================== # # Time: 3/26/21 12:31 PM # Author: Sun # Software: PyCharm # Description: # # # ======================================== from Tools.JSExcelHandler import JSExcelHandler import pandas as pd if __name__ == '__main__': list = JSExcelHandler.getPathFromRootFolder(r'C:\Users\sjj-001\Desktop\AUTOETLTEST\data') reslist = [] for path in list: df = pd.read_excel(path, sheet_name='YB01',skiprows=3) df = df.fillna('') sum = [] for index , row in df.iterrows(): str = row[1] list = [str, row[2], str.count(' ')] sum.append(list) for index , row in df.iterrows(): str = row[4] list = [str, row[5], str.count(' ')] sum.append(list) ttt = pd.DataFrame(sum, columns=['科目名称', '数值', 'count']) reslist.append(ttt)