def test_stdout_logging_and_csv_module_fail(capsys):
    """Check the stdout report when the CSV dialect cannot be detected.

    Runs ``investigate_encoding_and_dialect`` on the unparsable fixture with
    ``os._exit`` patched out, expects ``csv.Error`` to propagate, expects the
    patched exit to have been called with status 1, and finally checks that
    the error text plus the assistance hint appear in captured stdout.
    """
    # NOTE(review): block nesting was reconstructed from a source whose
    # indentation was lost -- confirm against the upstream file.
    fixture_path = 'tests/fixtures/unparsable.csv'
    expected_error = (
        "[ERROR] The csv module failed to detect the CSV "
        "dialect. Try giving hints with the --delimiter "
        "argument, E.g --delimiter=','"
    )
    expected_output = (
        '{}\nIf you need assistance please send the output of this '
        'script to [email protected].'
    ).format(expected_error)
    exit_target = 'datarobot_batch_scoring.utils.os._exit'

    with UI(None, logging.DEBUG, stdout=True) as ui:
        with mock.patch(exit_target) as m_exit:
            with pytest.raises(csv.Error):
                investigate_encoding_and_dialect(fixture_path, None, ui)
            # The process exit is mocked, so it is inspected rather than
            # actually terminating the test run.
            m_exit.assert_called_with(1)
        captured_out, _ = capsys.readouterr()
        assert expected_output in captured_out.strip('\n')
def test_investigate_encoding_and_dialect():
    """Check encoding and dialect detection on the windows-encoded fixture.

    Expects the returned encoding to be ``iso-8859-2`` and the dialect
    registered under ``dataset_dialect`` to use CRLF line endings, a double
    quote as quote character and a comma as delimiter.
    """
    expected_dialect_attrs = (
        ('lineterminator', '\r\n'),
        ('quotechar', '"'),
        ('delimiter', ','),
    )
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(
            'tests/fixtures/windows_encoded.csv', None, ui)
        registered = csv.get_dialect('dataset_dialect')
        assert detected_encoding == 'iso-8859-2'
        for attr_name, expected_value in expected_dialect_attrs:
            assert getattr(registered, attr_name) == expected_value
def test_investigate_encoding_and_dialect_submit_encoding():
    """Check that an explicitly supplied encoding is returned unchanged.

    ``chardet.detect`` is patched so the test can assert it was never
    called when ``encoding='iso-8859-2'`` is passed in.
    """
    detect_target = 'datarobot_batch_scoring.utils.chardet.detect'
    fixture_path = 'tests/fixtures/windows_encoded.csv'

    with UI(None, logging.DEBUG, stdout=False) as ui, \
            mock.patch(detect_target) as detect_mock:
        result = investigate_encoding_and_dialect(
            fixture_path,
            None,
            ui,
            fast=False,
            encoding='iso-8859-2',
            skip_dialect=False,
        )
        assert result == 'iso-8859-2'
        assert not detect_mock.called
def test_investigate_encoding_and_dialect_substitute_delimiter():
    """Check that a caller-supplied delimiter lands in the dataset dialect.

    Called with ``skip_dialect=True`` and a ``'|'`` delimiter; the patched
    ``csv.Sniffer`` must not be used and the dialect registered under
    ``dataset_dialect`` must carry the ``'|'`` delimiter.
    """
    sniffer_target = 'datarobot_batch_scoring.utils.csv.Sniffer'
    fixture_path = 'tests/fixtures/windows_encoded.csv'

    with UI(None, logging.DEBUG, stdout=False) as ui, \
            mock.patch(sniffer_target) as sniffer_mock:
        result = investigate_encoding_and_dialect(
            fixture_path,
            '|',
            ui,
            fast=False,
            encoding='utf-8',
            skip_dialect=True,
        )
        assert result == 'utf-8'  # Intentionally wrong
        assert not sniffer_mock.called
        registered = csv.get_dialect('dataset_dialect')
        assert registered.delimiter == '|'
def test_investigate_encoding_and_dialect_skip_dialect():
    """Check ``skip_dialect=True`` with an empty encoding and no delimiter.

    Expects ``iso-8859-2`` to be returned, the patched ``csv.Sniffer`` to go
    unused, and the dialect registered under ``dataset_dialect`` to have a
    comma delimiter.
    """
    sniffer_target = 'datarobot_batch_scoring.utils.csv.Sniffer'
    fixture_path = 'tests/fixtures/windows_encoded.csv'

    with UI(None, logging.DEBUG, stdout=False) as ui, \
            mock.patch(sniffer_target) as sniffer_mock:
        result = investigate_encoding_and_dialect(
            fixture_path,
            None,
            ui,
            fast=False,
            encoding='',
            skip_dialect=True,
        )
        assert result == 'iso-8859-2'
        assert not sniffer_mock.called
        registered = csv.get_dialect('dataset_dialect')
        assert registered.delimiter == ','
def test_auto_small_dataset():
    """Check that ``auto_sampler`` returns 500 for the regression_jp fixture."""
    fixture_path = 'tests/fixtures/regression_jp.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(
            fixture_path, None, ui)
        sample_size = auto_sampler(fixture_path, detected_encoding, ui)
        assert sample_size == 500
def test_auto_sample():
    """Check that ``auto_sampler`` returns 14980 for the criteo fixture."""
    fixture_path = 'tests/fixtures/criteo_top30_1m.csv.gz'
    with UI(None, logging.DEBUG, stdout=False) as ui:
        detected_encoding = investigate_encoding_and_dialect(
            fixture_path, None, ui)
        sample_size = auto_sampler(fixture_path, detected_encoding, ui)
        assert sample_size == 14980
        # NOTE(review): explicit close kept from the original; its placement
        # inside the ``with`` body is reconstructed -- confirm upstream.
        ui.close()