def test_h2o3_smalldata(f):
    """Read the file at path `f` with `dt.fread` and verify the result.

    The test is skipped when `f` contains any entry of the ignore list
    below, or when `is_ppc64()` reports true. Otherwise the file is parsed
    (with per-file parser overrides where needed) and the resulting frame is
    passed to `frame_integrity_check`.

    Parameters
    ----------
    f: str
        Path of the data file to parse. Matching is done by substring, so
        any path containing one of the listed fragments is affected.
        NOTE(review): presumably supplied by a parametrized fixture that is
        declared outside this block -- confirm.
    """
    ignored_files = {
        # Zip files containing >1 files
        os.path.join("gbm_test", "bank-full.csv.zip"),
        os.path.join("jira", "pub-999.zip"),
        os.path.join("parser", "hexdev_497", "airlines_no_first_header.zip"),
        os.path.join("parser", "hexdev_497", "airlines_first_header.zip"),
        os.path.join("parser", "hexdev_497", "airlines_small_csv.zip"),
        os.path.join("prostate", "prostate.bin.csv.zip"),
        os.path.join("smalldata", "images", "cat_dog_tiny_thumbnails.zip"),
        # Others
        os.path.join("arff", "folder1", "iris0.csv"),
        os.path.join("jira", "pubdev_2897.csv"),
        os.path.join("jira", "runit_pubdev_3590_unexpected_column.csv"),
        os.path.join("junit", "iris.xls.zip"),
        os.path.join("junit", "test_parse_mix.csv"),
        os.path.join("junit", "arff", "jm1_arff.txt"),
        os.path.join("junit", "arff", "jm1.arff.txt"),
        os.path.join("merge", "livestock.nuts.csv"),
        os.path.join("merge", "tourism.csv"),
        os.path.join("parser", "column.csv"),
    }
    # `pytest.skip` raises, so each guard below ends the test immediately;
    # no `return` or `else` is needed after it.
    if any(ff in f for ff in ignored_files):
        pytest.skip("On the ignored files list")
    if is_ppc64():
        pytest.skip("Fread tests disabled on PPC64")

    # Per-file overrides passed through to fread.
    params = {}
    if "test_pubdev3589" in f:
        # Use the newline character as the field separator for this file.
        params["sep"] = "\n"
    single_quoted_files = (
        "single_quotes_mixed.csv",
        "single_quotes_with_escaped_quotes.csv",
        "single_quotes_with_escaped_quotes_custom_escapechar.csv",
    )
    if any(name in f for name in single_quoted_files):
        params["quotechar"] = "'"

    # Warnings emitted while parsing are deliberately silenced; only
    # exceptions and integrity failures should fail the test.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        DT = dt.fread(f, **params)
        frame_integrity_check(DT)
def test_h2o3_bigdata(f):
    """Read the file at path `f` with `dt.fread` and verify the result.

    The test is skipped when `f` contains any entry of the ignore list
    below, or when `is_ppc64()` reports true. Otherwise the file is parsed
    with a memory limit of `MEMORY_LIMIT` (plus per-file overrides) and the
    resulting frame is passed to `frame_integrity_check`.

    Parameters
    ----------
    f: str
        Path of the data file to parse. Matching is done by substring, so
        any path containing one of the listed fragments is affected.
        NOTE(review): presumably supplied by a parametrized fixture that is
        declared outside this block -- confirm.
    """
    ignored_files = {
        # Feather files
        os.path.join("ipums_feather.gz"),
        # empty files
        os.path.join("mnist", "t10k-images-idx3-ubyte.gz"),
        os.path.join("mnist", "t10k-labels-idx1-ubyte.gz"),
        os.path.join("mnist", "train-images-idx3-ubyte.gz"),
        os.path.join("mnist", "train-labels-idx1-ubyte.gz"),
        # ARFF files
        os.path.join("parser", "anARFFFile.txt"),
        # zip files having more than 1 file inside
        os.path.join("flights-nyc", "delays14.csv.zip"),
        os.path.join("flights-nyc", "flights14.csv.zip"),
        os.path.join("flights-nyc", "weather_delays14.csv.zip"),
        os.path.join("images", "demo_disney_data.zip"),  # jpegs...
        os.path.join("jira", "la1s.wc.arff.txt.zip"),
        os.path.join("jira", "re0.wc.arff.txt.zip"),
        os.path.join("jira", "rotterdam.csv.zip"),
        os.path.join("parser", "hexdev_497", "milsongs_csv.zip"),
        os.path.join("glm", "GLM_model_python_1543520565753_1.zip"),
        os.path.join("glm", "GLM_model_python_1543520565753_3.zip"),
        os.path.join("glm", "GLM_model_python_1544561074878_1.zip"),
        # requires `comment` parameter
        os.path.join("new-poker-hand.full.311M.txt"),
        # files with 36M columns
        os.path.join("testng", "newsgroup_train1.csv"),
        os.path.join("testng", "newsgroup_validation1.csv"),
        # broken CRC zip files
        os.path.join("jira", "tenThousandCat50C.csv.zip"),
        os.path.join("jira", "tenThousandCat100C.csv.zip"),
        os.path.join("parser", "year2005.csv.gz"),
    }
    # Files that are read with `fill=True`.
    filledna_files = {
        os.path.join("lending-club", "LoanStats3a.csv"),
        os.path.join("lending-club", "LoanStats3b.csv"),
        os.path.join("lending-club", "LoanStats3c.csv"),
        os.path.join("lending-club", "LoanStats3d.csv"),
        os.path.join("LoanStats3a.csv"),
        os.path.join("LoanStats3b.csv"),
        os.path.join("LoanStats3c.csv"),
        os.path.join("LoanStats3d.csv"),
        os.path.join("Kaggle_Product_BO_Test_v2.csv.zip"),
        os.path.join("Kaggle_Product_BO_Training_v2.csv.zip"),
    }
    # `pytest.skip` raises, so each guard below ends the test immediately;
    # no `return` is needed after it.
    if any(ff in f for ff in ignored_files):
        pytest.skip("On the ignored files list")
    if is_ppc64():
        pytest.skip("Fread tests disabled on PPC64")

    params = {"memory_limit": MEMORY_LIMIT}
    if any(ff in f for ff in filledna_files):
        params["fill"] = True
    # Build the fragment with os.path.join, like every other entry here, so
    # the match does not depend on the platform's path separator.
    if os.path.join("imagenet", "cat_dog_mouse.tgz") in f:
        # Point fread at the csv member of the archive.
        f = os.path.join(f, "cat_dog_mouse.csv")

    # Warnings emitted while parsing are deliberately silenced; only
    # exceptions and integrity failures should fail the test.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        DT = dt.fread(f, **params)
        frame_integrity_check(DT)