def test_header_check_files(self):
    """
    Determine if files with no header are properly determined.

    Extends the shared ``input_file_names`` fixture list with files whose
    leading lines (author/description rows, blank rows) are *not* the
    header, and checks that ``CSVData._guess_header_row`` still picks the
    correct header line.

    NOTE(review): a later method in this file defines the same name, so
    that definition shadows this one at class construction — confirm which
    version is intended to run.
    """
    from itertools import islice
    from dataprofiler.data_readers import data_utils

    test_dir = os.path.join(test_root_path, 'data')
    # Extra fixtures whose first lines are authors/descriptions/blanks
    # rather than the header row itself.
    extra_header_cases = [
        dict(
            path=os.path.join(
                test_dir,
                'csv/sparse-first-and-last-column-header-and-author.txt'),
            count=6, delimiter=',', has_header=[1],
            num_columns=3, encoding='utf-8'),
        dict(
            path=os.path.join(
                test_dir,
                'csv/sparse-first-and-last-column-header-and-author-'
                'description.txt'),
            count=6, delimiter=',', has_header=[3],
            num_columns=3, encoding='utf-8'),
        dict(
            path=os.path.join(
                test_dir,
                'csv/sparse-first-and-last-column-empty-first-row.txt'),
            count=11, delimiter=',', has_header=[1],
            num_columns=3, encoding='utf-8'),
    ]

    for case in self.input_file_names + extra_header_cases:
        encoding = data_utils.detect_file_encoding(case['path'])
        with open(case['path'], encoding=encoding) as csvfile:
            # Only the first 5 lines are needed to guess the header.
            sample = ''.join(islice(csvfile, 5))
        guessed = CSVData._guess_header_row(sample, case['delimiter'])
        # Third arg is the failure message: the offending file's path.
        self.assertIn(guessed, case['has_header'], case['path'])
def test_file_UTF_encoding_detection(self):
    """
    Tests the ability for `data_utils.detect_file_encoding` to detect the
    encoding of text files. This test is specifically for UTF-8, UTF-16,
    and UTF-32 of csv or JSON.

    NOTE(review): a later method in this file defines the same name and
    therefore shadows this one — confirm which version should run.

    :return:
    """
    test_dir = os.path.join(test_root_path, 'data')
    # (relative fixture path, expected encoding) pairs.
    expected_encodings = [
        ('csv/iris-utf-8.csv', "utf-8"),
        ('csv/iris-utf-16.csv', "utf-16"),
        ('csv/iris-utf-32.csv', "utf-32"),
        ('json/iris-utf-8.json', "utf-8"),
        ('json/iris-utf-16.json', "utf-16"),
        ('json/iris-utf-32.json', "utf-32"),
    ]

    for rel_path, expected in expected_encodings:
        full_path = os.path.join(test_dir, rel_path)
        detected = data_utils.detect_file_encoding(file_path=full_path)
        # Detection may report e.g. "UTF-8"; compare case-insensitively.
        self.assertEqual(detected.lower(), expected)
def test_header_check_files(self):
    """
    Determine if files with no header are properly determined.

    For every fixture in the shared ``input_file_names`` list, sample the
    first few lines and verify ``CSVData._guess_header_row`` returns one of
    the accepted header-line indices.
    """
    from itertools import islice
    from dataprofiler.data_readers import data_utils

    for case in self.input_file_names:
        encoding = data_utils.detect_file_encoding(case['path'])
        with open(case['path'], encoding=encoding) as csvfile:
            # A 5-line sample is enough for header detection.
            sample = ''.join(islice(csvfile, 5))
        guessed = CSVData._guess_header_row(sample, case['delimiter'])
        # Include the file path in the failure message for easier triage.
        self.assertIn(guessed, case['has_header'], case['path'])
def test_file_UTF_encoding_detection(self):
    """
    Tests the ability for `data_utils.detect_file_encoding` to detect the
    encoding of text files. This test is specifically for UTF-8, UTF-16,
    and UTF-32 of csv or JSON.

    Rather than requiring an exact encoding-name match, the file content is
    decoded with both the expected and the detected encoding and the two
    decodings must agree on at least 99.9% of characters (e.g. detecting
    "ascii" for a pure-ASCII "utf-8" file is acceptable).

    :return:
    """

    def get_match_acc(expected, actual):
        """Fraction of character positions where the two decodings agree.

        Uses ``zip`` so differing decoded lengths cannot raise IndexError
        (mismatched tails simply count as non-matching), and guards the
        empty-string case against ZeroDivisionError.
        """
        if not expected:
            return 1.0
        matches = sum(a == b for a, b in zip(expected, actual))
        return matches / len(expected)

    test_dir = os.path.join(test_root_path, "data")
    input_files = [
        dict(path=os.path.join(test_dir, "csv/iris-utf-8.csv"),
             encoding="utf-8"),
        dict(path=os.path.join(test_dir, "csv/iris-utf-16.csv"),
             encoding="utf-16"),
        dict(path=os.path.join(test_dir, "csv/iris-utf-32.csv"),
             encoding="utf-32"),
        dict(path=os.path.join(test_dir, "json/iris-utf-8.json"),
             encoding="utf-8"),
        dict(path=os.path.join(test_dir, "json/iris-utf-16.json"),
             encoding="utf-16"),
        dict(path=os.path.join(test_dir, "json/iris-utf-32.json"),
             encoding="utf-32"),
        dict(path=os.path.join(test_dir, "txt/utf8.txt"),
             encoding="utf-8"),
        dict(path=os.path.join(test_dir, "csv/zomato.csv"),
             encoding="ISO-8859-1"),
        dict(path=os.path.join(test_dir, "csv/reddit_wsb.csv"),
             encoding="utf-8"),
    ]

    for input_file in input_files:
        detected_encoding = data_utils.detect_file_encoding(
            file_path=input_file["path"])

        with open(input_file["path"], "rb") as infile:
            # Read a max of 1 MB of data
            content = infile.read(1024 * 1024)

        # Assert at least 99.9% of the content was correctly decoded
        match_acc = get_match_acc(
            content.decode(input_file["encoding"]),
            content.decode(detected_encoding),
        )
        self.assertGreaterEqual(match_acc, 0.999)