def complex_file_union(self, choice):
    """Merge every text file in a user-chosen folder into one result CSV,
    re-attaching a shared multi-row title block to each written chunk.

    Args:
        choice: key into ``self.opt``; its ``'title'`` is shown in the
            folder / skip-row prompts.

    Returns:
        True on success, or None when the user cancels the folder prompt.

    Raises:
        Exception: when the folder contains no mergeable files.
    """
    fp = common.get_floder(self.opt[choice]['title'])
    if fp is None:
        return
    skip_row = common.get_skip_row(self.opt[choice]['title'])
    print('\n---------------开始合并文件---------------\n')
    # Convert Excel-type sources to CSV first; remember them for cleanup.
    trans_list = common.trans_files(fp, self.transtypes)
    files_csv = common.get_files(fp, self.filetypes)
    if len(files_csv) == 0:
        raise Exception("无可合并文件!")
    savefn = path.join(
        fp,
        'Result_UnionTable' + strftime("%Y%m%d%H%M%S", localtime()) + '.csv')
    # Shared title block read once from the input set (project helper).
    title = self.__read_title(files_csv, skip_row)
    fsize = sum(path.getsize(x) for x in files_csv)  # total input bytes
    chunkline = 500000   # rows per pandas chunk
    havechunk = 0        # input bytes already merged (progress bookkeeping)
    for f in files_csv:
        encode = common.get_file_encode(f)
        chunks = common.trunk_file_byrow(f, encode, chunkline, skip_row)
        for d in chunks:
            # Prepend the shared title rows to every chunk before writing.
            d_save = pd.concat([title, d], axis=0, sort=False)
            # Only the very first write emits the pandas column header.
            header = not path.exists(savefn)
            d_save.to_csv(savefn, mode='a', header=header,
                          index=False, encoding='gbk')
        # BUGFIX: count each file's size once, after all of its chunks are
        # written. The old code added the full file size per chunk, pushing
        # the progress display past 100% for multi-chunk files.
        havechunk += path.getsize(f)
        common.print_rateofprogress(havechunk, fsize)
    if trans_list:
        # Plain loop instead of a side-effect list comprehension.
        for x in trans_list:
            remove(x)
    print('\n\n---------------完成合并文件---------------\n')
    return True
def split_byrow(self, choice):
    """Split a user-chosen text file into pieces of ``split_line`` rows each.

    Pieces are written next to the source as ``name_<n>.ext``; when the user
    asked to keep the header, the header line is repeated atop every piece.

    Args:
        choice: key into ``self.opt``; its ``'title'`` drives the prompts.

    Returns:
        True on success, or None when the user cancels any prompt.

    Raises:
        Exception: when an Excel source converts to an empty file set.
    """
    fp = common.get_file(self.opt[choice]['title'])
    if not fp:
        return
    need_title = common.get_need_head(self.opt[choice]['title'])
    if need_title is None:
        return
    split_line = self.__get_split_row(self.opt[choice]['title'])
    if split_line is None:
        return
    skip_row = common.get_skip_row(self.opt[choice]['title'])
    if skip_row is None:
        return
    trance_list = []
    print('\n---------------开始分割文件---------------\n')
    if fp.lower().endswith(self.transtypes):
        # Excel input: convert to CSV first and split the converted files.
        trance_list = common.trans_files(fp, self.transtypes)
        if not trance_list:
            raise Exception('文件内容为空')
        split_file_list = trance_list
    else:
        split_file_list = [fp]
    allsize = sum(path.getsize(x) for x in split_file_list)  # total bytes
    currentsize = 0  # bytes processed so far across all files (progress)
    for f in split_file_list:
        fid = 0  # suffix counter for this file's pieces
        savepath, fbasename = path.split(f)
        fsize = path.getsize(f)
        cursize = 0  # bytes consumed from the current file
        with open(f, mode='rb') as fobj:
            # Discard the rows the user asked to skip.
            for _ in range(skip_row):
                tmp = fobj.readline()
                currentsize += len(tmp)
                cursize += len(tmp)
            # BUGFIX: initialise title so it can never be referenced
            # unbound if need_title is truthy but not exactly 1.
            title = b''
            if need_title == 1:
                title = fobj.readline()
                currentsize += len(title)
                cursize += len(title)
            while cursize < fsize:
                fid += 1
                savename, saveextension = path.splitext(fbasename)
                savefn = path.join(
                    savepath, savename + '_' + str(fid) + saveextension)
                # BUGFIX: manage the output handle with a context manager so
                # it is closed even when a write fails mid-piece (the old
                # bare open() leaked the handle on any exception).
                with open(savefn, 'wb') as fobj1:
                    if need_title:
                        fobj1.write(title)
                    for _ in range(split_line):
                        line = fobj.readline()
                        if not line:
                            break
                        fobj1.write(line)
                        currentsize += len(line)
                        cursize += len(line)
                        common.print_rateofprogress(currentsize, allsize)
    if trance_list:
        for x in trance_list:
            remove(x)
    print('\n\n---------------完成分割文件---------------\n')
    return True
def split_bycolumn(self, choice):
    """Split a file into one CSV per distinct value (combination) of the
    user-chosen key column(s).

    Each chunk of the source is grouped by the key columns and each group
    appended to a per-value CSV named after the sanitized key value(s).

    Args:
        choice: key into ``self.opt``; its ``'title'`` drives the prompts.

    Returns:
        True on success, or None when the user cancels any prompt.

    Raises:
        Exception: when an Excel source converts to an empty file set.
    """
    fp = common.get_file(self.opt[choice]['title'])
    if fp is None:
        return
    trans_list = []
    print('\n---------------开始处理文件---------------\n')
    if fp.lower().endswith(self.transtypes):
        # Excel input: convert to CSV first.
        trans_list = common.trans_files(fp, self.transtypes)
        if trans_list is None:
            raise Exception('文件内容为空')
        split_file_list = trans_list
    else:
        split_file_list = [fp]
    chunkline = 500000  # rows per pandas chunk
    for f in split_file_list:
        savepath, fbasename = path.split(f)
        savename, saveextension = path.splitext(fbasename)
        # NOTE(review): these prompts run once per converted file; looks
        # intentional (per-file settings) — confirm with the UI flow.
        skip_row = common.get_skip_row(self.opt[choice]['title'])
        if skip_row is None:
            return
        split_header = self.__get_split_header(
            f, self.opt[choice]['title'], skip_row)
        if not split_header:
            return
        encode = common.get_file_encode(f)
        for t in common.trunk_file_byrow(f, encode, chunkline, skip_row):
            print('------正在生成文件------')
            for index, value in t.groupby(by=split_header):
                # Single key -> scalar group label; multiple keys -> tuple.
                if len(split_header) == 1:
                    savename_tail = str(index)
                else:
                    savename_tail = '_'.join(str(x) for x in index)
                # Replace filesystem-hostile characters in the label.
                savename_tail = sub(r"[\/\\\:\*\?\"\<\>\|.]", "_",
                                    savename_tail)
                new_savename = savename + '_' + savename_tail + '.csv'
                savefn = path.join(savepath, new_savename)
                print(f'...{savefn}')
                # BUGFIX: emit the column header only on the first write to
                # each per-value file. The old header=True duplicated the
                # header row whenever a group spanned multiple chunks
                # (mode='a' kept appending it). Same convention as
                # complex_file_union.
                value.to_csv(savefn, index=False, encoding='gbk', mode='a',
                             header=not path.exists(savefn))
    if trans_list:
        for x in trans_list:
            remove(x)
    print('\n\n---------------完成分割文件---------------\n')
    return True
def simple_file_union(self, choice):
    """Byte-level merge of all text files in a user-chosen folder into one
    result CSV, optionally writing a single shared header line.

    Args:
        choice: key into ``self.opt``; its ``'title'`` drives the prompts.

    Returns:
        True on success, or None when the user cancels any prompt.

    Raises:
        Exception: when the folder contains no mergeable files.
    """
    fp = common.get_floder(self.opt[choice]['title'])
    if fp is None:
        return
    need_title = common.get_need_head(self.opt[choice]['title'])
    if need_title is None:
        return
    skip_row = common.get_skip_row(self.opt[choice]['title'])
    # CONSISTENCY FIX: every sibling method bails out when the user
    # cancels this prompt; this one silently carried None forward.
    if skip_row is None:
        return
    print('\n---------------开始合并文件---------------\n')
    trans_list = common.trans_files(fp, self.transtypes)  # 将excel转换为csv
    files_csv = common.get_files(fp, self.filetypes)  # 查找目录下的所有文本文件
    if len(files_csv) == 0:
        raise Exception('无可合并文件!')
    savefn = path.join(
        fp,
        'Result_UnionTable' + strftime("%Y%m%d%H%M%S", localtime()) + '.csv')
    chunksize = 100 * 1024 * 1024  # 100 MiB per raw read
    fsize = sum(path.getsize(x) for x in files_csv)  # total input bytes
    havechunk = 0       # bytes merged so far (progress bookkeeping)
    havetitle = False   # shared header already written?
    with open(savefn, 'ab+') as f0:
        for f in files_csv:
            for title, buf, lenbuf in common.trunk_csv_bysize(
                    f, need_title, chunksize, skip_row):
                # Write the header exactly once, from the first file.
                if need_title == 1 and not havetitle:
                    f0.write(title)
                    havetitle = True
                if lenbuf < chunksize:
                    # Last (partial) chunk of this file: make sure it ends
                    # with a newline so the next file starts on a new line.
                    # BUGFIX: accept a bare b'\n' terminator too — the old
                    # b'\r\n'-only test injected a blank line after files
                    # with Unix line endings.
                    if not buf.endswith(b'\n'):
                        buf += b'\r\n'
                        lenbuf += 2
                f0.write(buf)
                f0.flush()
                havechunk += lenbuf
                common.print_rateofprogress(havechunk, fsize)
    # Force the bar to 100% (padding bytes can leave it slightly short).
    common.print_rateofprogress(fsize, fsize)
    if trans_list:
        for x in trans_list:
            remove(x)
    print('\n\n---------------完成文件合并---------------\n')
    return True
def split_bysize(self, choice):
    """Split a user-chosen text file into pieces of roughly ``split_size``
    bytes each, cutting only on line boundaries.

    Pieces are written next to the source as ``name_<n>.ext``; when the
    user asked to keep the header, it is repeated atop every piece.

    Args:
        choice: key into ``self.opt``; its ``'title'`` drives the prompts.

    Returns:
        True on success, or None when the user cancels any prompt.

    Raises:
        Exception: when an Excel source converts to an empty file set.
    """
    fp = common.get_file(self.opt[choice]['title'])
    if fp is None:
        return
    need_title = common.get_need_head(self.opt[choice]['title'])
    if need_title is None:
        return
    split_size = self.__get_split_size(self.opt[choice]['title'])
    if split_size is None:
        return
    skip_row = common.get_skip_row(self.opt[choice]['title'])
    if skip_row is None:
        return
    trance_list = []
    print('\n---------------开始分割文件---------------\n')
    if fp.lower().endswith(self.transtypes):
        # Excel input: convert to CSV first and split the converted files.
        trance_list = common.trans_files(fp, self.transtypes)
        if not trance_list:
            raise Exception('文件内容为空...')
        split_file_list = trance_list
    else:
        split_file_list = [fp]
    chunksize = 50 * 1024 * 1024  # 50 MiB read granularity
    allsize = sum(path.getsize(x) for x in split_file_list)
    currentsize = 0  # 记录整体已分割大小
    for f in split_file_list:
        fid = 0  # suffix counter for this file's pieces
        savepath, fbasename = path.split(f)
        fsize = path.getsize(f)
        with open(f, mode='rb') as fobj:
            # Discard the rows the user asked to skip; they do not count
            # toward the bytes that need splitting.
            for _ in range(skip_row):
                tmp = fobj.readline()
                currentsize += len(tmp)
                fsize -= len(tmp)
            # BUGFIX: initialise title so it can never be referenced
            # unbound if need_title is truthy but not exactly 1.
            title = b''
            if need_title == 1:
                title = fobj.readline()
                currentsize += len(title)
                fsize -= len(title)
            chunknum = split_size // chunksize   # full chunks per piece
            splitnum = ceil(fsize / split_size)  # number of pieces
            for _ in range(splitnum):
                fid += 1
                savename, saveextension = path.splitext(fbasename)
                savefn = path.join(
                    savepath, savename + '_' + str(fid) + saveextension)
                # BUGFIX: context-manage the piece handle so it is closed
                # even when a write fails (old bare open() leaked it).
                with open(savefn, 'wb') as fobj1:
                    if need_title:
                        fobj1.write(title)
                    havechunk = 0  # bytes written to this piece so far
                    for _ in range(chunknum):
                        # readlines(hint) keeps whole lines, so pieces
                        # never cut a line in half.
                        lines = fobj.readlines(chunksize)
                        fobj1.writelines(lines)
                        fobj1.flush()
                        len_lines = sum(len(x) for x in lines)
                        havechunk += len_lines
                        currentsize += len_lines
                        common.print_rateofprogress(currentsize, allsize)
                    if split_size - havechunk > 0:
                        # Remainder smaller than one chunk: read it in one go.
                        lines = fobj.readlines(split_size - havechunk)
                        fobj1.writelines(lines)
                        currentsize += sum(len(x) for x in lines)
                        common.print_rateofprogress(currentsize, allsize)
                    fobj1.flush()
    if trance_list:
        for x in trance_list:
            remove(x)
    print('\n\n---------------完成分割文件---------------\n')
    return True