Example #1
 def complex_file_union(self, choice):
     fp = common.get_floder(self.opt[choice]['title'])
     if fp is None: return
     skip_row = common.get_skip_row(self.opt[choice]['title'])
     print('\n---------------Start merging files---------------\n')
     trans_list = common.trans_files(fp, self.transtypes)  # convert Excel files to CSV
     files_csv = common.get_files(fp, self.filetypes)  # find all text files in the directory
     if len(files_csv) == 0: raise Exception("No files to merge!")
     savefn = path.join(
         fp, 'Result_UnionTable' + strftime("%Y%m%d%H%M%S", localtime()) +
         '.csv')
     title = self.__read_title(files_csv, skip_row)
     fsize = sum([path.getsize(x) for x in files_csv])
     chunkline = 500000
     havechunk = 0
     for f in files_csv:
         encode = common.get_file_encode(f)
         df1 = common.trunk_file_byrow(f, encode, chunkline, skip_row)
         for d in df1:
             d_save = pd.concat([title, d], axis=0, sort=False)
             header = not path.exists(savefn)
             d_save.to_csv(savefn,
                           mode='a',
                           header=header,
                           index=False,
                           encoding='gbk')
         havechunk += path.getsize(f)
         common.print_rateofprogress(havechunk, fsize)
     if trans_list:
         for x in trans_list: remove(x)  # remove temporary CSVs converted from Excel
     print('\n\n---------------File merge complete---------------\n')
     return True
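
The example above depends on the project's own `common` helpers (`get_floder`, `trans_files`, `trunk_file_byrow`, `print_rateofprogress`), whose implementations are not shown. As a rough illustration of the same technique, here is a minimal, self-contained sketch that merges several CSV files by streaming each one in row chunks with plain pandas and appending to a single output, writing the header only once; the function name, file names, and chunk size are illustrative, not taken from the original code.

import os
import pandas as pd

def merge_csvs(files, out_path, chunk_rows=500_000):
    # stream each source file in row chunks so very large files fit in memory
    for f in files:
        for chunk in pd.read_csv(f, chunksize=chunk_rows, dtype=str):
            write_header = not os.path.exists(out_path)  # header only on the first write
            chunk.to_csv(out_path, mode='a', header=write_header, index=False)

# merge_csvs(['a.csv', 'b.csv'], 'Result_UnionTable.csv')
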
Example #2
 def split_byrow(self, choice):
     fp = common.get_file(self.opt[choice]['title'])
     if not fp: return
     need_title = common.get_need_head(self.opt[choice]['title'])
     if need_title is None: return
     split_line = self.__get_split_row(self.opt[choice]['title'])
     if split_line is None: return
     skip_row = common.get_skip_row(self.opt[choice]['title'])
     if skip_row is None: return
     trance_list = []
     print('\n---------------Start splitting file---------------\n')
     if fp.lower().endswith(self.transtypes):
         trance_list = common.trans_files(fp, self.transtypes)  # convert Excel to CSV first
         if not trance_list: raise Exception('File content is empty')
         split_file_list = trance_list
     else:
         split_file_list = [fp]
     allsize = sum([path.getsize(x) for x in split_file_list])  # total size of all files to split
     currentsize = 0  # bytes already read across all files
     for f in split_file_list:
         fid = 0
         (savepath, fbasename) = path.split(f)
         fsize = path.getsize(f)  # size of the current file
         cursize = 0  # bytes already read from the current file
         with open(f, mode='rb') as fobj:
             for i in range(skip_row):
                 tmp = fobj.readline()
                 currentsize += len(tmp)
                 cursize += len(tmp)
             if need_title == 1:
                 title = fobj.readline()
                 currentsize += len(title)
                 cursize += len(title)
             while cursize < fsize:
                 fid += 1
                 (savename, saveextension) = path.splitext(fbasename)
                 savename = savename + '_' + str(fid) + saveextension
                 savefn = path.join(savepath, savename)
                 fobj1 = open(savefn, 'wb')
                 if need_title: fobj1.write(title)
                 for i in range(split_line):
                     line = fobj.readline()
                     if not line: break
                     fobj1.write(line)
                     currentsize += len(line)
                     cursize += len(line)
                     common.print_rateofprogress(currentsize, allsize)
                 fobj1.close()
     if trance_list:
         for x in trance_list: remove(x)  # remove temporary CSVs converted from Excel
     print('\n\n---------------File splitting complete---------------\n')
     return True
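
split_byrow above writes every `split_line` rows to a new numbered file, optionally repeating the header line in each part. A minimal standalone sketch of the same idea without the class or the `common` helpers (the function name, file name, and parameters are illustrative):

from os import path

def split_by_rows(src, rows_per_file=1000, keep_header=True):
    base, ext = path.splitext(src)
    with open(src, 'rb') as fin:
        header = fin.readline() if keep_header else b''
        part = 0
        line = fin.readline()
        while line:
            part += 1
            with open(f'{base}_{part}{ext}', 'wb') as fout:
                fout.write(header)  # repeat the header at the top of every part
                count = 0
                while line and count < rows_per_file:
                    fout.write(line)
                    count += 1
                    line = fin.readline()

# split_by_rows('data.csv', rows_per_file=100000)
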
Example #3
 def split_bycolumn(self, choice):
     fp = common.get_file(self.opt[choice]['title'])
     if fp is None: return
     trans_list = []
     print('\n---------------Start processing file---------------\n')
     if fp.lower().endswith(self.transtypes):
         trans_list = common.trans_files(fp, self.transtypes)  # convert Excel to CSV first
         if not trans_list: raise Exception('File content is empty')
         split_file_list = trans_list
     else:
         split_file_list = [fp]
     chunkline = 500000
     for f in split_file_list:
         (savepath, fbasename) = path.split(f)
         (savename, saveextension) = path.splitext(fbasename)
         skip_row = common.get_skip_row(self.opt[choice]['title'])
         if skip_row is None: return
         split_header = self.__get_split_header(f,
                                                self.opt[choice]['title'],
                                                skip_row)
         if not split_header: return
         encode = common.get_file_encode(f)
         for t in common.trunk_file_byrow(f, encode, chunkline, skip_row):
             print('------Generating files------')
             t_group = t.groupby(by=split_header)
             for index, value in t_group:
                 savename_tail = str(index) if len(
                     split_header) == 1 else '_'.join(
                         [str(x) for x in index])
                 savename_tail = sub(r"[\/\\\:\*\?\"\<\>\|.]", "_",
                                     savename_tail)
                 new_savename = savename + '_' + savename_tail + '.csv'
                 savefn = path.join(savepath, new_savename)
                 print(f'...{savefn}')
                 # write the header only when the file does not exist yet, so groups
                 # that reappear in later chunks are appended without duplicate headers
                 value.to_csv(savefn,
                              index=False,
                              encoding='gbk',
                              mode='a',
                              header=not path.exists(savefn))
     if trans_list:
         for x in trans_list: remove(x)  # remove temporary CSVs converted from Excel
     print('\n\n---------------File splitting complete---------------\n')
     return True
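
split_bycolumn groups each chunk by one or more columns and appends every group to its own CSV. A minimal sketch of the same groupby-based split with plain pandas, assuming a single column name ('region' is just a placeholder); as in the fixed example above, the header is written only when the target file does not exist yet, so groups that reappear in later chunks are appended without duplicate headers:

import os, re
import pandas as pd

def split_by_column(src, column, chunk_rows=500_000):
    base, _ = os.path.splitext(src)
    for chunk in pd.read_csv(src, chunksize=chunk_rows, dtype=str):
        for key, group in chunk.groupby(column):
            safe = re.sub(r'[\\/:*?"<>|.]', '_', str(key))  # replace characters illegal in file names
            out = f'{base}_{safe}.csv'
            group.to_csv(out, mode='a', header=not os.path.exists(out), index=False)

# split_by_column('sales.csv', 'region')
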
Example #4
    def simple_file_union(self, choice):
        fp = common.get_floder(self.opt[choice]['title'])
        if fp is None: return
        need_title = common.get_need_head(self.opt[choice]['title'])
        if need_title is None: return
        skip_row = common.get_skip_row(self.opt[choice]['title'])
        print('\n---------------Start merging files---------------\n')
        trans_list = common.trans_files(fp, self.transtypes)  # convert Excel files to CSV
        files_csv = common.get_files(fp, self.filetypes)  # find all text files in the directory
        if len(files_csv) == 0: raise Exception('No files to merge!')
        savefn = path.join(
            fp, 'Result_UnionTable' + strftime("%Y%m%d%H%M%S", localtime()) +
            '.csv')
        chunksize = 100 * 1024 * 1024
        fsize = sum([path.getsize(x) for x in files_csv])
        havechunk = 0

        title = None
        buf = None
        lenbuf = None
        havetitle = False
        with open(savefn, 'ab+') as f0:
            for f in files_csv:
                for title, buf, lenbuf in common.trunk_csv_bysize(
                        f, need_title, chunksize, skip_row):
                    if need_title == 1:
                        if not havetitle:
                            f0.write(title)
                            havetitle = True
                    if lenbuf < chunksize:
                        if not buf.endswith(b'\r\n'):
                            buf += b'\r\n'
                            lenbuf += 2
                    f0.write(buf)
                    f0.flush()
                    havechunk += lenbuf
                    common.print_rateofprogress(havechunk, fsize)
        common.print_rateofprogress(fsize, fsize)
        if trans_list:
            for x in trans_list: remove(x)  # remove temporary CSVs converted from Excel
        print('\n\n---------------File merge complete---------------\n')
        return True
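
simple_file_union copies raw bytes rather than parsing rows, relying on `common.trunk_csv_bysize` to yield the header and body chunks of each file. A minimal sketch of that byte-level merge without the helper (the function name, file names, and chunk size are illustrative):

def concat_files(files, out_path, keep_header=True, chunk_bytes=100 * 1024 * 1024):
    with open(out_path, 'wb') as fout:
        for i, f in enumerate(files):
            with open(f, 'rb') as fin:
                if keep_header:
                    header = fin.readline()   # each source file starts with a header row
                    if i == 0:
                        fout.write(header)    # keep only the first file's header
                last = b''
                while True:
                    buf = fin.read(chunk_bytes)
                    if not buf:
                        break
                    fout.write(buf)
                    last = buf
                if last and not last.endswith(b'\n'):
                    fout.write(b'\r\n')       # keep the next file's first row on its own line

# concat_files(['jan.csv', 'feb.csv'], 'Result_UnionTable.csv')
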
Example #5
 def split_bysize(self, choice):
     fp = common.get_file(self.opt[choice]['title'])
     if fp is None: return
     need_title = common.get_need_head(self.opt[choice]['title'])
     if need_title is None: return
     split_size = self.__get_split_size(self.opt[choice]['title'])
     if split_size is None: return
     skip_row = common.get_skip_row(self.opt[choice]['title'])
     if skip_row is None: return
     trance_list = []
     print('\n---------------Start splitting file---------------\n')
     if fp.lower().endswith(self.transtypes):
         trance_list = common.trans_files(fp, self.transtypes)  # convert Excel to CSV first
         if not trance_list: raise Exception('File content is empty...')
         split_file_list = trance_list
     else:
         split_file_list = [fp]
     chunksize = 50 * 1024 * 1024
     allsize = sum([path.getsize(x) for x in split_file_list])
     currentsize = 0  # bytes already split across all files
     for f in split_file_list:
         fid = 0
         (savepath, fbasename) = path.split(f)
         fsize = path.getsize(f)
         with open(f, mode='rb') as fobj:
             for i in range(skip_row):
                 tmp = fobj.readline()
                 currentsize += len(tmp)
                 fsize -= len(tmp)
             if need_title == 1:
                 title = fobj.readline()
                 currentsize += len(title)
                 fsize -= len(title)
             chunknum = split_size // chunksize  # full chunk reads per split file
             splitnum = ceil(fsize / split_size)  # number of split files to produce
             for i in range(splitnum):
                 fid += 1
                 (savename, saveextension) = path.splitext(fbasename)
                 savename = savename + '_' + str(fid) + saveextension
                 savefn = path.join(savepath, savename)
                 fobj1 = open(savefn, 'wb')
                 if need_title: fobj1.write(title)
                 havechunk = 0
                 for j in range(chunknum):
                     lines = fobj.readlines(chunksize)
                     fobj1.writelines(lines)
                     fobj1.flush()
                     len_lines = sum([len(x) for x in lines])
                     havechunk += len_lines
                     currentsize += len_lines
                     common.print_rateofprogress(currentsize, allsize)
                 if split_size - havechunk > 0:  # read once more for the remainder smaller than one chunk
                     lines = fobj.readlines(split_size - havechunk)
                     fobj1.writelines(lines)
                     currentsize += sum([len(x) for x in lines])
                     common.print_rateofprogress(currentsize, allsize)
                     fobj1.flush()
                 fobj1.close()
     if trance_list:
         for x in trance_list: remove(x)  # remove temporary CSVs converted from Excel
     print('\n\n---------------File splitting complete---------------\n')
     return True
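
split_bysize produces parts of roughly `split_size` bytes while still cutting on line boundaries, because `readlines(hint)` only returns whole lines. A minimal standalone sketch of the same technique (the function name and sizes are illustrative):

from os import path

def split_by_size(src, part_bytes=200 * 1024 * 1024, keep_header=True,
                  chunk_bytes=50 * 1024 * 1024):
    base, ext = path.splitext(src)
    with open(src, 'rb') as fin:
        header = fin.readline() if keep_header else b''
        part = 0
        while True:
            lines = fin.readlines(chunk_bytes)   # reads whole lines, roughly chunk_bytes at a time
            if not lines:
                break
            part += 1
            with open(f'{base}_{part}{ext}', 'wb') as fout:
                fout.write(header)               # repeat the header at the top of every part
                fout.writelines(lines)
                written = sum(len(x) for x in lines)
                while written < part_bytes:      # top the part up to roughly part_bytes
                    lines = fin.readlines(min(chunk_bytes, part_bytes - written))
                    if not lines:
                        break
                    fout.writelines(lines)
                    written += sum(len(x) for x in lines)

# split_by_size('big.csv', part_bytes=100 * 1024 * 1024)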