def test_i_walk_dir_for_filepaths_names(self, mock_walk):
    """
    Ensure every file name produced by the walk of the given root
    dir is yielded together with its path, for all walk results.
    """
    def fake_walk(root_dir):
        """
        Yield (dirpath, dirnames, filenames) results like os.walk
        """
        yield ("dir0", ["dir1"], ["cat", "hat", "bat"])
        yield ("dir1", [], ["tin", "can"])

    mock_walk.side_effect = fake_walk

    # (path, name) pairs in the order the walker is expected to yield them.
    expected_pairs = [
        ("dir0/cat", "cat"),
        ("dir0/hat", "hat"),
        ("dir0/bat", "bat"),
        ("dir1/tin", "tin"),
        ("dir1/can", "can"),
    ]

    walker = i_walk_dir_for_filepaths_names("fake")

    # Pull one item at a time so each expected pair is checked in order.
    for expected in expected_pairs:
        self.assertEqual(next(walker), expected)

    mock_walk.assert_called_once_with("fake")
def i_walk_csv_paths(input_dir):
    """
    Return an iterator over the paths of csv files in the
    input directory.

    :param input_dir: path to the input directory
    """
    # Innermost call: file paths and names found under input_dir.
    # Middle call: keep only the entries is_file_csv accepts.
    # Outermost call: yield item 0 (the path) of each remaining entry.
    return yield_nth_of(
        0,
        ifilter(
            is_file_csv,
            i_walk_dir_for_filepaths_names(str(input_dir)),
        ),
    )
def main():
    """
    Concatenate csv files together in no particular order.
    """
    import pathlib

    source_dir = pathlib.Path("test_data/things_kinds")
    target_dir = pathlib.Path("out_data/things_kinds")

    # Lazily select only the csv entries found under the source directory.
    csv_entries = ifilter(
        karld.io.is_file_csv,
        i_walk_dir_for_filepaths_names(str(source_dir)),
    )

    # chain.from_iterable is the combiner: it strings the files' rows
    # together one file after another. The output file gets no prefix.
    csv_files_to_file(
        chain.from_iterable,
        "",
        str(target_dir),
        "combined_things.csv",
        csv_entries,
    )
def serial_run_files_to_files(file_to_file, in_dir, filter_func=None):
    """
    Apply the file_to_file function to each file in in_dir, one
    after another.

    Running this instead of the pooled variant can make debugging
    your file_to_file function easier.

    :param file_to_file: callable that takes file paths.
    :param in_dir: path to process all files from.
    :param filter_func: Takes a tuple of path and base
        name of a file and returns a bool.
    :returns: A list of the return values of file_to_file.
    """
    walked = i_walk_dir_for_filepaths_names(in_dir)

    # Without a filter every walked entry is processed.
    selected = ifilter(filter_func, walked) if filter_func else walked

    return [file_to_file(entry) for entry in selected]
def pool_run_files_to_files(file_to_file, in_dir, filter_func=None):
    """
    Apply the file_to_file function to each file in in_dir using a
    multi-process pool.

    :param file_to_file: callable that takes file paths.
    :param in_dir: path to process all files from.
    :param filter_func: Takes a tuple of path and base
        name of a file and returns a bool.
    :returns: A list of return values from the map.
    """
    from concurrent.futures import ProcessPoolExecutor

    walked = i_walk_dir_for_filepaths_names(in_dir)

    # Without a filter every walked entry is processed.
    selected = ifilter(filter_func, walked) if filter_func else walked

    with ProcessPoolExecutor() as executor:
        # Collect the results while the executor is still open.
        outcomes = list(executor.map(file_to_file, selected))
    return outcomes
def test_sort_merge_csv_files_to_file(self):
    """
    Ensure csv_files_to_file will read multiple csv files
    and write one csv file with the contents as yielded
    from the given combiner function.

    Ensure i_walk_dir_for_filepaths_names produces the paths
    and basenames of the files in the test_data directory.

    The generated output file is removed even when an assertion
    fails, so failed runs do not leave files in the temp directory.
    """
    from karld.run_together import csv_files_to_file

    out_dir = os.path.join(tempfile.gettempdir(),
                           "karld_test_sort_merge")

    # Timestamp prefix gives each run its own output file name.
    prefix = str(datetime.now())

    out_filename = "things_combined.csv"
    input_dir = os.path.join(os.path.dirname(__file__),
                             "test_data",
                             "things_kinds")

    file_path_names = i_walk_dir_for_filepaths_names(input_dir)

    expected_file = os.path.join(out_dir,
                                 "{}{}".format(prefix, out_filename))

    expected_lines = [
        'cat,animal',
        'cheese,dairy',
        'apple,fruit',
        'orange,fruit',
        'peach,fruit',
        'pear,fruit',
        'tomato,fruit',
        'mushroom,fungus',
        'iron,metal',
        'titanium,metal',
        'ruby,mineral',
        'topaz,mineral',
        'WĄŻ,utf-8 sample',
        'dróżką,utf-8 sample',
        'celery,vegetable',
    ]

    def remove_expected_file():
        """Delete the output file if it is present."""
        if os.path.exists(expected_file):
            os.remove(expected_file)

    # Start from a clean slate in case a stale file is present.
    remove_expected_file()

    try:
        csv_file_path_names = ifilter(is_file_csv, file_path_names)

        csv_files_to_file(
            combine_things,
            prefix,
            out_dir,
            out_filename,
            csv_file_path_names)

        self.assertTrue(os.path.exists(expected_file))

        # NOTE(review): the file is read with the platform default
        # encoding although the expected lines contain non-ASCII
        # text - confirm this matches how csv_files_to_file writes.
        with open(expected_file) as result_file:
            contents = result_file.read()

        self.assertEqual(expected_lines, contents.splitlines())
    finally:
        # Runs on success and on failure alike, so a failing
        # assertion no longer leaves the output file behind.
        remove_expected_file()