def _split_helper(self, filename, split_size, has_header=False, dos_adjust=False):
    """Split *filename* with ``splitfile`` and verify the line accounting.

    Each part yielded by ``File_Splitter.splitfile`` is re-counted with
    ``LineCounter`` and compared with the count the splitter reported.
    The sum over all parts must equal the line count of the source file,
    minus one when ``has_header`` is true.

    Args:
        filename: Path of the file to split.
        split_size: Passed unchanged to ``splitfile``.
        has_header: Forwarded to ``File_Splitter``; when true, one line is
            subtracted from the expected total.
        dos_adjust: Not used by this helper; kept so existing callers
            keep working.
    """
    splitter = File_Splitter(filename, has_header)
    part_total_count = 0

    for part_name, line_count in splitter.splitfile(split_size):
        try:
            part_count = LineCounter(part_name).line_count
            self.assertEqual(part_count, line_count)
            part_total_count += part_count
        finally:
            # Delete the part even if a check above fails, so a failing
            # run does not leave split output behind on disk.
            os.unlink(part_name)

    expected_total = LineCounter(filename).line_count
    if has_header:
        expected_total -= 1
    self.assertEqual(part_total_count, expected_total)
def test_autosplit_file(self):
    """Run ``_auto_split_helper`` over the sample data files.

    The A&E CSV is first checked to be reported as a DOS file, then every
    sample is split with the listed split count.
    """
    dos_csv = "data/AandE_Data_2011-04-10.csv"
    self.assertEqual(File_Splitter(f(dos_csv)).file_type(), FileType.DOS)

    # (relative path, total line count, split count, helper keyword options)
    cases = (
        ("data/fourlines.txt", 4, 2, {"has_header": False}),
        ("data/ninelines.txt", 9, 3, {"has_header": True}),
        ("data/inventory.csv", 5, 2, {"has_header": True}),
        (dos_csv, 301, 3, {"has_header": True, "dos_adjust": True}),
        (dos_csv, 301, 2, {"has_header": True, "dos_adjust": True}),
        (dos_csv, 301, 1, {"has_header": True, "dos_adjust": True}),
        (dos_csv, 301, 0, {"has_header": True, "dos_adjust": True}),
        ("data/10k.txt", 10000, 5, {"has_header": True}),
        ("data/yellow_tripdata_2015-01-06-1999.csv", 1999, 4,
         {"has_header": False}),
    )
    for relative_path, total_lines, split_count, options in cases:
        self._auto_split_helper(f(relative_path), total_lines, split_count,
                                **options)
def _auto_split_helper(self, filename, lines, split_count, has_header=False, dos_adjust=False):
    """Split *filename* with ``autosplit`` and verify the line accounting.

    The splitter's ``line_count`` must equal ``lines``. Each part yielded
    by ``autosplit`` must be non-empty and hold exactly the number of
    lines the splitter reported for it. The parts together must add up to
    ``lines``, minus one when ``has_header`` is true.

    Args:
        filename: Path of the file to split.
        lines: Expected total line count of ``filename``.
        split_count: Passed unchanged to ``autosplit``.
        has_header: Forwarded to ``File_Splitter``; when true, one line is
            subtracted from the expected total.
        dos_adjust: Not used by this helper; kept so existing callers
            keep working.
    """
    splitter = File_Splitter(filename, has_header=has_header)
    self.assertEqual(splitter.line_count, lines)

    part_total_count = 0
    for part_name, line_count in splitter.autosplit(split_count):
        try:
            part_count = LineCounter(part_name).line_count
            self.assertGreater(part_count, 0)
            self.assertEqual(part_count, line_count)
            part_total_count += part_count
        finally:
            # Delete the part even if a check above fails, so a failing
            # run does not leave split output behind on disk.
            os.unlink(part_name)

    expected_total = lines - 1 if has_header else lines
    self.assertEqual(part_total_count, expected_total)
def _compare_input_output(self, input_filename, output_filenames, has_header=False):
    """Check that the split pieces reproduce the original file line by line.

    The pieces named by ``File_Splitter.shim_names(output_filenames)`` are
    read in order and every line is compared with the next line of
    ``input_filename``. Each piece is deleted once it has been compared.

    Args:
        input_filename: Path of the original, unsplit file.
        output_filenames: Passed unchanged to ``File_Splitter.shim_names``.
        has_header: When true, the first line of the original is skipped
            before the comparison starts.
    """
    # NOTE(review): nothing here checks that the original has been fully
    # consumed once the last piece ends, so missing trailing lines would
    # go unnoticed. Confirm whether that check is wanted.
    with open(input_filename, "r") as original_file:
        if has_header:
            original_file.readline()

        for filename in File_Splitter.shim_names(output_filenames):
            try:
                with open(filename, "r") as file_piece:
                    for piece_line in file_piece:
                        self.assertEqual(original_file.readline(), piece_line)
            finally:
                # Runs after the piece is closed, and also when a
                # comparison fails, so no piece is left behind.
                os.unlink(filename)
def test_copy_file(self):
    """Copy the A&E sample to a ``.1`` sibling, ignoring its header.

    The source must be reported as a DOS file before it is copied.
    """
    relative_path = "data/AandE_Data_2011-04-10.csv"
    splitter = File_Splitter(f(relative_path), has_header=True)
    self.assertEqual(splitter.file_type(), FileType.DOS)

    copy_target = f(relative_path) + ".1"
    # Unpacking requires copy_file to return exactly two items.
    _, total_lines = splitter.copy_file(copy_target, ignore_header=True)
def test_count_lines(self):
    """Check ``File_Splitter.line_count`` against known sample files."""
    # (expected line count, relative path)
    expected_counts = (
        (3, "data/threelines.txt"),
        (0, "data/emptyfile.txt"),
        (4, "data/fourlines.txt"),
        (5, "data/inventory.csv"),
    )
    for expected, relative_path in expected_counts:
        self.assertEqual(expected, File_Splitter(f(relative_path)).line_count)
def test_get_average_line_size(self):
    """Check that ``tenlines.txt`` reports an average line size of 10."""
    splitter = File_Splitter(f("data/tenlines.txt"))
    self.assertEqual(10, splitter.get_average_line_size())
def split_file_main(*argv):
    """Command-line entry point: split one or more text files into pieces.

    Each existing input file is split with ``File_Splitter.autosplit``
    when ``--autosplit`` is given, otherwise with
    ``File_Splitter.splitfile`` using ``--splitsize``. The line counts of
    the pieces produced for a file are then checked against that file's
    own line count, minus one when the splitter reports a header.

    Args:
        *argv: Forwarded positionally to ``ArgumentParser.parse_args``.
            Pass a single list of argument strings, or nothing to parse
            ``sys.argv``.

    Returns:
        A list of ``(name, lines)`` tuples, one per piece, covering every
        input file that was processed. Empty when none of the named files
        exists.

    Raises:
        ValueError: The pieces of a file do not add up to its line count.
        SystemExit: No filenames were given (exit status 0), or argparse
            rejected the arguments.
    """
    usage_message = '''

Split a text file into seperate pieces. if you specify autosplit then the program
will use the first ten lines to calcuate an average line os_size and use that to determine
the rough number of splits.

if you use **--splitsize** then the file will be split using **--splitsize** chunks until it is consumed.
'''

    parser = argparse.ArgumentParser(usage=usage_message)

    # The two flags must be separate arguments. Written as adjacent string
    # literals they concatenate into the single bogus flag '-v", --version',
    # which leaves '--version' unregistered.
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __VERSION__)
    parser.add_argument(
        "--autosplit", type=int,
        help="split file based on loooking at the first ten lines and overall file os_size [default : %(default)s]")
    parser.add_argument(
        '--hasheader', default=False, action="store_true",
        help="Ignore header when calculating splits, don't include header in output")
    # NOTE(review): --delimiter is parsed but never read in this function.
    parser.add_argument('--delimiter', default=",",
                        help="Delimiter for fields[default : %(default)s] ")
    parser.add_argument("--splitsize", type=int,
                        help="Split file into chunks of this os_size")
    parser.add_argument('--verbose', default=False, action="store_true",
                        help="Print out what is happening")
    parser.add_argument("filenames", nargs="*", help='list of files')

    args = parser.parse_args(*argv)

    if not args.filenames:
        print("No input file specified to split")
        sys.exit(0)

    # Pieces from every processed source. Defined before the loop so the
    # function still returns a list when no named file exists.
    results = []

    for source in args.filenames:
        if not os.path.isfile(source):
            print(f"No such input file:'{source}'")
            continue

        splitter = File_Splitter(source, args.hasheader)

        if args.autosplit:
            if args.verbose:
                print(f"Autosplitting: '{source}' into approximately {args.autosplit} parts")
            pieces = [(name, size)
                      for name, size in splitter.autosplit(args.autosplit)]
        else:
            if args.verbose:
                # Name the file being split, not always the first one. An
                # f-string also copes with --splitsize being omitted.
                print(f"Splitting '{source}' using {args.splitsize} splitsize")
            pieces = [(name, size)
                      for name, size in splitter.splitfile(args.splitsize)]

        original_lines = splitter.line_count
        total_new_lines = 0
        for count, (name, lines) in enumerate(pieces, start=1):
            total_new_lines += lines
            if args.verbose:
                print(f"{count:4}. '{name:20}'. Lines : {lines:6}")

        if len(pieces) > 1 and args.verbose:
            print(f"{source} {original_lines:16}")

        if splitter.has_header:
            # NOTE(review): printed regardless of --verbose; kept so the
            # existing output is unchanged.
            print("Has_header")
            original_lines -= 1

        # Check only this source's pieces against this source's line
        # count. Checking the running list of all pieces would raise
        # spuriously from the second input file onwards.
        if pieces and total_new_lines != original_lines:
            raise ValueError(f"Lines of '{source}' and total lines of pieces"
                             f"{pieces}"
                             f"\ndo not match:"
                             f"\noriginal_lines : {original_lines}"
                             f"\npieces lines : {total_new_lines}")

        results.extend(pieces)

    return results