예제 #1
0
    def _split_helper(self,
                      filename,
                      split_size,
                      has_header=False,
                      dos_adjust=False):
        """Split *filename* with ``splitfile`` and check the pieces' line totals.

        Each ``(part_name, line_count)`` pair produced by
        ``File_Splitter.splitfile(split_size)`` is checked against an
        independent ``LineCounter`` count of the piece, after which the piece
        is deleted. Finally the summed piece counts are compared with the
        line count of the original file (minus one when *has_header* is true).

        Args:
            filename: Path of the file to split.
            split_size: Value forwarded unchanged to ``splitfile``.
            has_header: Forwarded to ``File_Splitter``; when true the pieces
                are expected to hold one line fewer than the original.
            dos_adjust: Accepted for signature compatibility; not used by
                this helper.
        """
        splitter = File_Splitter(filename, has_header)

        part_total_count = 0
        for (part_name, line_count) in splitter.splitfile(split_size):
            # The count reported by the splitter must agree with a fresh count
            # of what was actually written to disk.
            part_count = LineCounter(part_name).line_count
            self.assertEqual(part_count, line_count)
            part_total_count += part_count
            os.unlink(part_name)

        lc = LineCounter(filename)

        if has_header:
            self.assertEqual(part_total_count, lc.line_count - 1)
        else:
            self.assertEqual(part_total_count, lc.line_count)
예제 #2
0
 def test_autosplit_file(self):
     """Run the autosplit helper over a table of sample data files.

     First confirms that the A&E CSV sample is detected as a DOS file, then
     feeds each case below to ``_auto_split_helper`` in order.
     """
     self.assertEqual(
         File_Splitter(f("data/AandE_Data_2011-04-10.csv")).file_type(),
         FileType.DOS)

     # (relative path, expected line count, split count, has_header, dos_adjust)
     cases = (
         ("data/fourlines.txt", 4, 2, False, False),
         ("data/ninelines.txt", 9, 3, True, False),
         ("data/inventory.csv", 5, 2, True, False),
         ("data/AandE_Data_2011-04-10.csv", 301, 3, True, True),
         ("data/AandE_Data_2011-04-10.csv", 301, 2, True, True),
         ("data/AandE_Data_2011-04-10.csv", 301, 1, True, True),
         ("data/AandE_Data_2011-04-10.csv", 301, 0, True, True),
         ("data/10k.txt", 10000, 5, True, False),
         ("data/yellow_tripdata_2015-01-06-1999.csv", 1999, 4, False, False),
     )
     for relative_path, expected_lines, split_count, header, dos in cases:
         self._auto_split_helper(f(relative_path),
                                 expected_lines,
                                 split_count,
                                 has_header=header,
                                 dos_adjust=dos)
예제 #3
0
    def _auto_split_helper(self,
                           filename,
                           lines,
                           split_count,
                           has_header=False,
                           dos_adjust=False):
        """Autosplit *filename* and verify the resulting pieces.

        Asserts that the splitter reports *lines* lines for the source, that
        every piece yielded by ``autosplit(split_count)`` is non-empty and
        holds exactly the reported number of lines, and that the pieces
        together add up to *lines* (minus one when *has_header* is true).
        Each piece is deleted once it has been checked.

        *dos_adjust* is accepted but not used by this helper.
        """
        source = File_Splitter(filename, has_header=has_header)
        self.assertEqual(source.line_count, lines)

        pieces_line_total = 0
        for (piece_path, reported_lines) in source.autosplit(split_count):
            counted = LineCounter(piece_path).line_count
            self.assertGreater(counted, 0)
            self.assertEqual(counted, reported_lines)
            pieces_line_total += counted
            os.unlink(piece_path)

        # With a header the pieces are expected to total one line fewer.
        expected_total = lines - 1 if has_header else lines
        self.assertEqual(pieces_line_total, expected_total)
예제 #4
0
 def _compare_input_output(self,
                           input_filename,
                           output_filenames,
                           has_header=False):
     """Assert that the split pieces reproduce the input file line by line.

     The pieces named by ``File_Splitter.shim_names(output_filenames)`` are
     read in order and each of their lines is compared with the next line
     read from *input_filename*. A piece is deleted only after all of its
     lines have matched; a failed assertion propagates and leaves the
     remaining files in place.

     Note that this does not check that the input file has been fully
     consumed once the last piece ends.

     Args:
         input_filename: Path of the original, unsplit file.
         output_filenames: Passed to ``File_Splitter.shim_names`` to obtain
             the piece filenames.
         has_header: When true, the first line of the input file is skipped
             before the comparison starts.
     """
     with open(input_filename, "r") as original_file:
         if has_header:
             # Discard the first line so the comparison starts after it.
             original_file.readline()
         for filename in File_Splitter.shim_names(output_filenames):
             with open(filename, "r") as file_piece:
                 for line in file_piece:
                     left = original_file.readline()
                     self.assertEqual(left, line)
             os.unlink(filename)
예제 #5
0
 def test_copy_file(self):
     """Copy the DOS-format A&E sample to a ``.1`` sibling, ignoring the header."""
     source_splitter = File_Splitter(f("data/AandE_Data_2011-04-10.csv"),
                                     has_header=True)
     self.assertEqual(source_splitter.file_type(), FileType.DOS)

     # Destination is the source path with ".1" appended.
     copy_target = f("data/AandE_Data_2011-04-10.csv") + ".1"
     (_, total_lines) = source_splitter.copy_file(copy_target,
                                                  ignore_header=True)
예제 #6
0
 def test_count_lines(self):
     """Check ``File_Splitter.line_count`` against known sample files."""
     # (expected line count, relative path of the sample file)
     expectations = (
         (3, "data/threelines.txt"),
         (0, "data/emptyfile.txt"),
         (4, "data/fourlines.txt"),
         (5, "data/inventory.csv"),
     )
     for expected_lines, relative_path in expectations:
         counted = File_Splitter(f(relative_path)).line_count
         self.assertEqual(expected_lines, counted)
예제 #7
0
 def test_get_average_line_size(self):
     """The ten-line sample file should report an average line size of 10."""
     splitter = File_Splitter(f("data/tenlines.txt"))
     average = splitter.get_average_line_size()
     self.assertEqual(10, average)
예제 #8
0
def split_file_main(*argv):
    """Command-line entry point: split each named text file into pieces.

    For every existing input file a ``File_Splitter`` is created and the file
    is split either with ``autosplit`` (when ``--autosplit`` is given and
    non-zero) or with ``splitfile`` using ``--splitsize``. After splitting,
    the line counts of the pieces are summed and compared with the source's
    line count (less one when the splitter reports a header).

    Args:
        *argv: Forwarded positionally to ``ArgumentParser.parse_args``. Pass a
            single list of argument strings, or nothing to have argparse read
            ``sys.argv``.

    Returns:
        A list of ``(name, lines)`` tuples, one per piece, accumulated over
        all processed input files. Empty when none of the named files exist.

    Raises:
        ValueError: If the pieces of a source do not add up to the source's
            line count.
        SystemExit: If no filenames are given (exit status 0), or raised by
            argparse on bad arguments / ``--version``.
    """
    usage_message = '''
    
Split a text file into seperate pieces. if you specify 
autosplit then the program will use the first ten lines 
to calcuate an average line os_size and use that to 
determine the rough number of splits.

if you use **--splitsize** then the file will be split 
using **--splitsize** chunks until it is consumed.
'''

    parser = argparse.ArgumentParser(usage=usage_message)

    # The short and long flags must be two separate option strings; written
    # as adjacent literals they would be concatenated into one bogus option.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __VERSION__)
    parser.add_argument(
        "--autosplit",
        type=int,
        help=
        "split file based on loooking at the first ten lines and overall file os_size [default : %(default)s]"
    )
    parser.add_argument(
        '--hasheader',
        default=False,
        action="store_true",
        help=
        "Ignore header when calculating splits, don't include header in output"
    )
    parser.add_argument('--delimiter',
                        default=",",
                        help="Delimiter for fields[default : %(default)s] ")
    parser.add_argument("--splitsize",
                        type=int,
                        help="Split file into chunks of this os_size")
    parser.add_argument('--verbose',
                        default=False,
                        action="store_true",
                        help="Print out what is happening")
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    if not args.filenames:
        print("No input file specified to split")
        sys.exit(0)

    # Pieces of every processed source; bound up front so the function still
    # returns a list when all of the named files are missing.
    results = []

    for source in args.filenames:

        if not os.path.isfile(source):
            print(f"No such input file:'{source}'")
            continue

        splitter = File_Splitter(source, args.hasheader)

        # Pieces are collected per source so that the line check below only
        # counts the pieces that belong to this file.
        if args.autosplit:
            if args.verbose:
                print(
                    f"Autosplitting: '{source}' into approximately {args.autosplit} parts"
                )
            pieces = [(name, size)
                      for name, size in splitter.autosplit(args.autosplit)]
        else:
            # NOTE(review): when neither --autosplit nor --splitsize is given,
            # args.splitsize is None here; the verbose '%i' format below would
            # then raise TypeError. Confirm how splitfile treats None.
            if args.verbose:
                print("Splitting '%s' using %i splitsize" %
                      (source, args.splitsize))
            pieces = [(name, size)
                      for name, size in splitter.splitfile(args.splitsize)]

        original_lines = splitter.line_count
        total_new_lines = 0

        for count, (name, lines) in enumerate(pieces, start=1):
            total_new_lines = total_new_lines + lines
            if args.verbose:
                print(f"{count:4}. '{name:20}'. Lines : {lines:6}")

        if len(pieces) > 1 and args.verbose:
            print(f"{source} {original_lines:16}")

        if splitter.has_header:
            print("Has_header")
            # The header line is not part of any piece.
            original_lines = original_lines - 1
        if pieces and (total_new_lines != original_lines):
            raise ValueError(f"Lines of '{source}' and total lines of pieces"
                             f"{pieces}"
                             f"\ndo not match:"
                             f"\noriginal_lines : {original_lines}"
                             f"\npieces lines   : {total_new_lines}")

        results.extend(pieces)

    return results