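 # The tests below exercise split_file(), which returns a 4-tuple of
 # (split_file_list, header_path, total_rows, file_size); each entry of
 # split_file_list is itself a tuple of at least
 # (part_file_path, row_count, starting_row_number).
 #
 # The _prepare_data() helper the tests rely on is not part of this
 # excerpt. Below is a minimal sketch of what it plausibly does, inferred
 # from how the tests use it; the file name and column names are
 # assumptions, not the project's actual values.
 def _prepare_data(self, rows):
     # Local imports keep this sketch self-contained within the excerpt.
     import csv
     import os
     # Write a CSV with one header row plus `rows` data rows and return
     # its path (hypothetical columns).
     path = os.path.join(self.output_dir, 'input.csv')
     with open(path, 'w', newline='') as source_csv:
         writer = csv.writer(source_csv)
         writer.writerow(['id', 'value'])
         for i in range(rows):
             writer.writerow([i, 'value_%d' % i])
     return path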
 def test_split_file_by_equal_parts(self):
     rows = 3
     parts = 3
     split_file_list, header_path, total_rows, file_size = split_file(
         self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
     self.assertEqual(len(split_file_list), parts)
     self.assertIn('part', split_file_list[0][0])
     self.assertIn('headers.csv', header_path)
     self.assertEqual(total_rows, rows)
     self.assertTrue(file_size > 0)
 def test_split_file_by_less_rows_than_parts(self):
     rows = 1
     parts = 4
     split_file_list, header_path, total_rows, file_size = split_file(
         self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
     self.assertEqual(len(split_file_list), 1)
     self.assertIn('headers.csv', header_path)
     self.assertEqual(total_rows, rows)
     self.assertTrue(file_size > 0)
     # Check row counts: the single sub-file holds all the rows
     self.assertEqual(split_file_list[0][1], rows)
 def test_split_file_by_even_rows_even_split(self):
     rows = 4
     parts = 2
     split_file_list, header_path, total_rows, file_size = split_file(
         self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
     self.assertEqual(len(split_file_list), parts)
     self.assertIn('headers.csv', header_path)
     self.assertEqual(total_rows, rows)
     self.assertTrue(file_size > 0)
     # Check row counts: 4 rows split evenly into 2 parts of 2 rows each
     self.assertEqual(split_file_list[0][1], 2)
     self.assertEqual(split_file_list[1][1], 2)
     # Check the 1-based starting row number of each part
     self.assertEqual(split_file_list[0][2], 1)
     self.assertEqual(split_file_list[1][2], 3)
 def test_split_file_by_row_limit_less_than_total(self):
     rows = 10
     row_limit = 4
     split_file_list, header_path, total_rows, file_size = split_file(
         self._prepare_data(rows),
         row_limit=row_limit,
         output_dir=self.output_dir)
     self.assertEqual(len(split_file_list), 3)
     self.assertIn('headers.csv', header_path)
     self.assertEqual(total_rows, rows)
     self.assertTrue(file_size > 0)
     # Check row counts: 10 rows with a limit of 4 per part yield
     # ceil(10 / 4) == 3 sub-files of 4, 4, and 2 rows
     self.assertEqual(split_file_list[0][1], 4)
     self.assertEqual(split_file_list[1][1], 4)
     self.assertEqual(split_file_list[2][1], 2)
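# Note: the Celery decorator is not shown in this excerpt. For task.name and
# task.request.id (used below) to resolve, the function must be registered as
# a Celery task. A minimal sketch, assuming an app instance named `celery`
# (the app and task names here are illustrative, not the project's actual
# values):
#
#     from celery import Celery
#     celery = Celery('udl2')
#
#     @celery.task(name='udl2.W_file_splitter.task')
#     def task(incoming_msg):
#         ...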
def task(incoming_msg):
    '''
    Celery task that splits the expanded CSV file into sub-files
    for the parallel file loader.
    '''
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg[mk.GUID_BATCH]
    parts = incoming_msg[mk.PARTS]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]
    csv_file = get_file_type_from_dir('.csv', expanded_dir)

    subfiles_dir = tenant_directory_paths[mk.SUBFILES]

    # do actual work of splitting file
    split_file_tuple_list, header_file_path, totalrows, filesize = split_file(
        csv_file, parts=parts, output_dir=subfiles_dir)

    finish_time = datetime.datetime.now()
    spend_time = finish_time - start_time

    logger.info(task.name)
    logger.info("FILE_SPLITTER: Split <%s> into %i sub-files in %s",
                csv_file, parts, spend_time)

    # Benchmark
    benchmark = BatchTableBenchmark(
        guid_batch,
        load_type,
        task.name,
        start_time,
        finish_time,
        size_records=totalrows,
        size_units=filesize,
        udl_phase_step_status=NotificationConstants.SUCCESS,
        task_id=str(task.request.id),
        tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()
    # Outgoing message to be piped to the parallel file loader
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({
        mk.SPLIT_FILE_LIST: split_file_tuple_list,
        mk.HEADER_FILE_PATH: header_file_path,
        mk.SIZE_RECORDS: totalrows
    })
    return outgoing_msg
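# For reference, the incoming message consumed above carries at least the
# following keys; the values shown are illustrative placeholders only:
#
#     incoming_msg = {
#         mk.GUID_BATCH: 'guid-of-this-batch',
#         mk.PARTS: 4,
#         mk.LOAD_TYPE: 'assessment',          # placeholder value
#         mk.TENANT_NAME: 'example_tenant',    # placeholder value
#         mk.TENANT_DIRECTORY_PATHS: {
#             mk.EXPANDED: '/path/to/expanded',
#             mk.SUBFILES: '/path/to/subfiles',
#         },
#     }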