def test_split_file_by_equal_parts(self):
    rows = 3
    parts = 3
    split_file_list, header_path, total_rows, file_size = split_file(
        self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
    self.assertEqual(len(split_file_list), parts)
    self.assertIn('part', split_file_list[0][0])
    self.assertIn('headers.csv', header_path)
    self.assertEqual(total_rows, rows)
    self.assertGreater(file_size, 0)

def test_split_file_by_less_rows_than_parts(self):
    rows = 1
    parts = 4
    split_file_list, header_path, total_rows, file_size = split_file(
        self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
    self.assertEqual(len(split_file_list), 1)
    self.assertIn('headers.csv', header_path)
    self.assertEqual(total_rows, rows)
    self.assertGreater(file_size, 0)
    # Check row counts: the single sub-file holds all rows (here, exactly 1)
    self.assertEqual(split_file_list[0][1], rows)

def test_split_file_by_row_limit_more_than_total(self):
    rows = 10
    row_limit = 4
    split_file_list, header_path, total_rows, file_size = split_file(
        self._prepare_data(rows), row_limit=row_limit, output_dir=self.output_dir)
    self.assertEqual(len(split_file_list), 3)
    self.assertIn('headers.csv', header_path)
    self.assertEqual(total_rows, rows)
    self.assertGreater(file_size, 0)
    # Check row counts: 10 rows with a limit of 4 yields parts of 4, 4 and 2
    self.assertEqual(split_file_list[0][1], 4)
    self.assertEqual(split_file_list[1][1], 4)
    self.assertEqual(split_file_list[2][1], 2)

def test_split_file_by_even_rows_even_split(self):
    rows = 4
    parts = 2
    split_file_list, header_path, total_rows, file_size = split_file(
        self._prepare_data(rows), parts=parts, output_dir=self.output_dir)
    self.assertEqual(len(split_file_list), parts)
    self.assertIn('headers.csv', header_path)
    self.assertEqual(total_rows, rows)
    self.assertGreater(file_size, 0)
    # Check row counts
    self.assertEqual(split_file_list[0][1], 2)
    self.assertEqual(split_file_list[1][1], 2)
    # Check starting row numbers (1-based)
    self.assertEqual(split_file_list[0][2], 1)
    self.assertEqual(split_file_list[1][2], 3)

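# The tests above rely on a self._prepare_data helper that is not shown in
# this section. A minimal sketch of such a helper, assuming it writes a CSV
# file with one header row plus `rows` data rows into self.output_dir and
# returns the file path (the file name and column names are illustrative
# assumptions, not the real fixture):
def _prepare_data(self, rows):
    import csv
    import os
    csv_path = os.path.join(self.output_dir, 'input.csv')  # hypothetical name
    with open(csv_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['guid', 'name'])  # assumed header columns
        for i in range(1, rows + 1):
            writer.writerow(['guid-%d' % i, 'row %d' % i])  # one data row each
    return csv_path
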
def task(incoming_msg):
    '''
    This is the Celery task for splitting a file
    '''
    start_time = datetime.datetime.now()
    guid_batch = incoming_msg[mk.GUID_BATCH]
    parts = incoming_msg[mk.PARTS]
    load_type = incoming_msg[mk.LOAD_TYPE]
    tenant_directory_paths = incoming_msg[mk.TENANT_DIRECTORY_PATHS]
    expanded_dir = tenant_directory_paths[mk.EXPANDED]
    csv_file = get_file_type_from_dir('.csv', expanded_dir)
    subfiles_dir = tenant_directory_paths[mk.SUBFILES]

    # do the actual work of splitting the file
    split_file_tuple_list, header_file_path, \
        totalrows, filesize = split_file(csv_file, parts=parts,
                                         output_dir=subfiles_dir)

    finish_time = datetime.datetime.now()
    spend_time = finish_time - start_time
    logger.info(task.name)
    logger.info("FILE_SPLITTER: Split <%s> into %i sub-files in %s"
                % (csv_file, parts, spend_time))

    # Record a benchmark entry for this UDL phase
    benchmark = BatchTableBenchmark(
        guid_batch, load_type, task.name, start_time, finish_time,
        size_records=totalrows, size_units=filesize,
        udl_phase_step_status=NotificationConstants.SUCCESS,
        task_id=str(task.request.id),
        tenant=incoming_msg[mk.TENANT_NAME])
    benchmark.record_benchmark()

    # Outgoing message to be piped to the parallel file loader
    outgoing_msg = {}
    outgoing_msg.update(incoming_msg)
    outgoing_msg.update({mk.SPLIT_FILE_LIST: split_file_tuple_list,
                         mk.HEADER_FILE_PATH: header_file_path,
                         mk.SIZE_RECORDS: totalrows})
    return outgoing_msg

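# For reference, a minimal sketch of the split_file contract exercised by the
# tests and the task above. This is NOT the real UDL implementation, only an
# assumed reading of its observed behavior: each entry in the returned list is
# a (part_path, row_count, start_row) tuple with 1-based start rows, and the
# header row is written once to headers.csv in output_dir.
def split_file(file_path, parts=None, row_limit=None, output_dir='.'):
    import csv
    import math
    import os
    with open(file_path, newline='') as source:
        reader = csv.reader(source)
        header = next(reader)
        rows = list(reader)
    total_rows = len(rows)
    file_size = os.path.getsize(file_path)
    # derive a per-part row limit from `parts` when no explicit limit is given
    if row_limit is None:
        row_limit = max(1, math.ceil(total_rows / parts))
    header_path = os.path.join(output_dir, 'headers.csv')
    with open(header_path, 'w', newline='') as header_file:
        csv.writer(header_file).writerow(header)
    split_file_list = []
    for index, start in enumerate(range(0, total_rows, row_limit)):
        chunk = rows[start:start + row_limit]
        part_path = os.path.join(output_dir, 'part_%d.csv' % index)
        with open(part_path, 'w', newline='') as part_file:
            csv.writer(part_file).writerows(chunk)
        # (path, number of rows in this part, 1-based starting row number)
        split_file_list.append((part_path, len(chunk), start + 1))
    return split_file_list, header_path, total_rows, file_size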