def test_csv_null_values() -> None:
    csv = textwrap.dedent(
        """\
        a,b,c
        na,b,c
        a,na,c
        """
    )
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values="na")
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = textwrap.dedent(
        """\
        a,b,c
        na,b,c
        a,n/a,c
        """
    )
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values=["na", "n/a"])
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = textwrap.dedent(
        """\
        a,b,c
        na,b,c
        a,n/a,c
        """
    )
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values={"a": "na", "b": "n/a"})
    assert df[0, "a"] is None
    assert df[1, "b"] is None

def test_column_rename_and_dtype_overwrite() -> None:
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        new_columns=["A", "B", "C"],
        dtypes={"A": pl.Utf8, "B": pl.Int64, "C": pl.Float32},
    )
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]

    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        columns=["a", "c"],
        new_columns=["A", "C"],
        dtypes={"A": pl.Utf8, "C": pl.Float32},
    )
    assert df.dtypes == [pl.Utf8, pl.Float32]

    csv = """
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        new_columns=["A", "B", "C"],
        dtypes={"A": pl.Utf8, "C": pl.Float32},
        has_header=False,
    )
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]

def test_csv_globbing() -> None:
    path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "examples",
            "aggregate_multiple_files_in_chunks",
            "datasets",
            "*.csv",
        )
    )
    df = pl.read_csv(path)
    assert df.shape == (135, 4)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, columns=[0, 1])

    df = pl.read_csv(path, columns=["category", "sugars_g"])
    assert df.shape == (135, 2)
    assert df.row(-1) == ("seafood", 1)
    assert df.row(0) == ("vegetables", 2)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, dtypes=[pl.Utf8, pl.Int64, pl.Int64, pl.Int64])

    dtypes = {
        "category": pl.Utf8,
        "calories": pl.Int32,
        "fats_g": pl.Float32,
        "sugars_g": pl.Int32,
    }
    df = pl.read_csv(path, dtypes=dtypes)
    assert df.dtypes == list(dtypes.values())

def test_csv_globbing(examples_dir: str) -> None:
    path = os.path.abspath(os.path.join(examples_dir, "*.csv"))
    df = pl.read_csv(path)
    assert df.shape == (135, 4)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, columns=[0, 1])

    df = pl.read_csv(path, columns=["category", "sugars_g"])
    assert df.shape == (135, 2)
    assert df.row(-1) == ("seafood", 1)
    assert df.row(0) == ("vegetables", 2)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, dtypes=[pl.Utf8, pl.Int64, pl.Int64, pl.Int64])

    dtypes = {
        "category": pl.Utf8,
        "calories": pl.Int32,
        "fats_g": pl.Float32,
        "sugars_g": pl.Int32,
    }
    df = pl.read_csv(path, dtypes=dtypes)
    assert df.dtypes == list(dtypes.values())

def test_csv_null_values():
    csv = """
a,b,c
na,b,c
a,na,c"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values="na")
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = """
a,b,c
na,b,c
a,n/a,c"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values=["na", "n/a"])
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = """
a,b,c
na,b,c
a,n/a,c"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values={"a": "na", "b": "n/a"})
    assert df[0, "a"] is None
    assert df[1, "b"] is None

def test_csv_date_handling() -> None:
    csv = """date
1745-04-02
1742-03-21
1743-06-16
1730-07-22
""
1739-03-16
"""
    expected = pl.DataFrame(
        {
            "date": [
                date(1745, 4, 2),
                date(1742, 3, 21),
                date(1743, 6, 16),
                date(1730, 7, 22),
                None,
                date(1739, 3, 16),
            ]
        }
    )
    out = pl.read_csv(csv.encode(), parse_dates=True)
    assert out.frame_equal(expected, null_equal=True)

    dtypes = {"date": pl.Date}
    out = pl.read_csv(csv.encode(), dtypes=dtypes)
    assert out.frame_equal(expected, null_equal=True)

def parse_data_column_info(headers, data_s, sep, columns_number, columns_info=None):
    # If no column info is supplied, derive one column name per data column
    # from the header block; otherwise use the caller-supplied names directly.
    if columns_info is None:
        col = list(
            map(
                lambda x: utils.parse_column_info(
                    headers, x, MAP_QUANTITY_NUMBER_COLUMN_NAME_BORE
                ),
                range(1, columns_number + 1),
            )
        )
        return pl.read_csv(
            io.StringIO(data_s),
            sep=sep,
            new_columns=col,
            has_headers=False,
            projection=list(range(0, len(col))),
        )
    else:
        return pl.read_csv(
            io.StringIO(data_s),
            sep=sep,
            new_columns=columns_info,
            has_headers=False,
        )

def test_csv_write_escape_newlines() -> None:
    df = pl.DataFrame(dict(escape=["n\nn"]))
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    read_df = pl.read_csv(f)
    assert df.frame_equal(read_df)

def test_csv_string_escaping() -> None:
    df = pl.DataFrame({"a": ["Free trip to A,B", '''Special rate "1.79"''']})
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    df_read = pl.read_csv(f)
    assert df_read.frame_equal(df)

def test_read_csv_categorical() -> None:
    f = io.BytesIO()
    f.write(
        b"col1,col2,col3,col4,col5,col6\n'foo',2,3,4,5,6\n'bar',8,9,10,11,12"
    )
    f.seek(0)
    df = pl.read_csv(f, has_header=True, dtypes={"col1": pl.Categorical})
    assert df["col1"].dtype == pl.Categorical

def test_empty_string_missing_round_trip() -> None:
    df = pl.DataFrame({"varA": ["A", "", None], "varB": ["B", "", None]})
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    df_read = pl.read_csv(f)
    assert df.frame_equal(df_read)

def fallback_chrono_parser() -> None:
    data = """date_1,date_2
2021-01-01,2021-1-1
2021-02-02,2021-2-2
2021-10-10,2021-10-10"""
    assert pl.read_csv(data.encode(), parse_dates=True).null_count().row(0) == (0, 0)

def parse_data(headers, data_s, column_names=None):
    separator = utils.find_separator(headers)
    # Remove multiple whitespaces
    # TODO: find a way for polars to handle columns with variable amounts of whitespace
    if separator == " ":
        new_data = re.sub("[ \t]+", " ", data_s.replace("!", ""))
    else:
        # If we have another separator remove all whitespace around it
        new_data = re.sub(
            f"[\t ]*{re.escape(separator)}[\t ]*",
            separator,
            data_s.replace(separator + "!", "").replace("!", ""),
        )
    # Remove whitespace at the beginning and end of lines, and remove the
    # last trailing line
    new_data = "\n".join([line.strip() for line in new_data.splitlines()]).rstrip()
    return pl.read_csv(
        new_data.encode(),
        sep=separator,
        new_columns=column_names,
        has_headers=False,
    )

def get_genotype_iter_vars_file(imputation_run_name, vars_fname, samples):
    with open(vars_fname) as vars_file:
        next(vars_file)
        try:
            next(vars_file)
        except StopIteration:
            # this is an empty vars file:
            # yield only the list of details fields and then exit without yielding variants
            itr = load_and_filter_genotypes.load_strs(
                imputation_run_name, f'1:1-1', samples
            )
            yield next(itr)
            return
        f = pl.read_csv(vars_file, sep='\t')
        chroms = f['chrom']
        poses = f['pos']

        first = True
        for (chrom, pos) in zip(chroms, poses):
            itr = load_and_filter_genotypes.load_strs(
                imputation_run_name, f'{chrom}:{pos}-{pos}', samples
            )
            # yield or skip the extra details line
            if first:
                yield next(itr)
                first = False
            else:
                next(itr)
            # yield the genotype
            yield next(itr)

def test_to_polars_dataframe(report):
    # This relies entirely on Arrow, so as long as those tests work,
    # these should too.
    df = report.to_polars()
    df_csv = pl.read_csv(MOCK_CSV_PATH)
    df_csv = df_csv.with_column(
        pl.col("day").str.strptime(pl.Date).cast(pl.Datetime)
    )
    assert df.frame_equal(df_csv)

def test_compressed_csv() -> None:
    # gzip compression
    csv = """
a,b,c
1,a,1.0
2,b,2.0,
3,c,3.0
"""
    fout = io.BytesIO()
    with gzip.GzipFile(fileobj=fout, mode="w") as f:
        f.write(csv.encode())

    csv_bytes = fout.getvalue()
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame(
        {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]}
    )
    assert out.frame_equal(expected)

    # now from disk
    csv_file = Path(__file__).parent.parent / "files" / "gzipped.csv"
    out = pl.read_csv(str(csv_file))
    assert out.frame_equal(expected)

    # now with column projection
    out = pl.read_csv(csv_bytes, columns=["a", "b"])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert out.frame_equal(expected)

    # zlib compression
    csv_bytes = zlib.compress(csv.encode())
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame(
        {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]}
    )
    assert out.frame_equal(expected)

    # no compression
    f2 = io.BytesIO(b"a,b\n1,2\n")
    out2 = pl.read_csv(f2)
    expected = pl.DataFrame({"a": [1], "b": [2]})
    assert out2.frame_equal(expected)

def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the cellid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with cell_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid", how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="record")
    }

    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))

def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to drugid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the drugid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with compound_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid", how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="record")
    }

    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))

def test_compressed_csv():
    # gzip compression
    csv = """
a,b,c
1,a,1.0
2,b,2.0,
3,c,3.0
"""
    fout = io.BytesIO()
    with gzip.GzipFile(fileobj=fout, mode="w") as f:
        f.write(csv.encode())

    csv_bytes = fout.getvalue()
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame(
        {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]}
    )
    assert out.frame_equal(expected)

    # now from disk
    out = pl.read_csv("tests/files/gzipped.csv")
    assert out.frame_equal(expected)

    # now with column projection
    out = pl.read_csv(csv_bytes, columns=["a", "b"])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert out.frame_equal(expected)

    # zlib compression
    csv_bytes = zlib.compress(csv.encode())
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame(
        {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]}
    )
    assert out.frame_equal(expected)

    # no compression
    f = io.BytesIO(b"a, b\n1,2\n")
    out = pl.read_csv(f)
    expected = pl.DataFrame({"a": [1], "b": [2]})
    assert out.frame_equal(expected)

def test_dtype_overwrite_with_column_name_selection() -> None:
    csv = textwrap.dedent(
        """\
        a,b,c,d
        1,2,3,4
        1,2,3,4
        """
    )
    f = io.StringIO(csv)
    df = pl.read_csv(f, columns=["c", "b", "d"], dtypes=[pl.Int32, pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int32, pl.Int64]

def test_partial_dtype_overwrite():
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, dtype=[pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]

def test_partial_dtype_overwrite() -> None:
    csv = textwrap.dedent(
        """\
        a,b,c
        1,2,3
        1,2,3
        """
    )
    f = io.StringIO(csv)
    df = pl.read_csv(f, dtypes=[pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]

def test_different_eol_char() -> None:
    csv = "a,1,10;b,2,20;c,3,30"
    expected = pl.DataFrame(
        {
            "column_1": ["a", "b", "c"],
            "column_2": [1, 2, 3],
            "column_3": [10, 20, 30],
        }
    )
    assert pl.read_csv(csv.encode(), eol_char=";", has_header=False).frame_equal(
        expected
    )

def test_read_csv_buffer_ownership() -> None:
    buf = io.BytesIO(b"\xf0\x9f\x98\x80,5.55,333\n\xf0\x9f\x98\x86,-5.0,666")
    df = pl.read_csv(
        buf,
        has_header=False,
        new_columns=["emoji", "flt", "int"],
    )
    # confirm that read_csv succeeded, and didn't close the input buffer (#2696)
    assert df.shape == (2, 3)
    assert not buf.closed

def test_read_csv_columns_argument(
    col_input: Union[List[int], List[str]], col_out: List[str]
) -> None:
    csv = """a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, columns=col_input)
    assert df.shape[0] == 2
    assert df.columns == col_out

def test_partial_decompression(foods_csv: str) -> None:
    fout = io.BytesIO()
    with open(foods_csv, "rb") as fread:
        with gzip.GzipFile(fileobj=fout, mode="w") as f:
            f.write(fread.read())

    csv_bytes = fout.getvalue()
    for n_rows in [1, 5, 26]:
        out = pl.read_csv(csv_bytes, n_rows=n_rows)
        assert out.shape == (n_rows, 4)

def test_invalid_utf8() -> None:
    np.random.seed(1)
    bts = bytes(np.random.randint(0, 255, 200))
    file = path.join(path.dirname(__file__), "nonutf8.csv")
    with open(file, "wb") as f:
        f.write(bts)

    a = pl.read_csv(file, has_headers=False, encoding="utf8-lossy")
    b = pl.scan_csv(file, has_headers=False, encoding="utf8-lossy").collect()
    assert a.frame_equal(b, null_equal=True)

def test_partial_column_rename():
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    for use in [True, False]:
        f.seek(0)
        df = pl.read_csv(f, new_columns=["foo"], use_pyarrow=use)
        assert df.columns == ["foo", "b", "c"]

def test_csv_whitespace_delimiter_at_end_do_not_skip() -> None:
    csv = "0\t1\t\t\t\t"
    assert pl.read_csv(csv.encode(), sep="\t", has_header=False).to_dict(False) == {
        "column_1": [0],
        "column_2": [1],
        "column_3": [None],
        "column_4": [None],
        "column_5": [None],
        "column_6": [None],
    }

def test_to_from_buffer(df: pl.DataFrame) -> None:
    buf = io.BytesIO()
    df.write_csv(buf)
    buf.seek(0)
    read_df = pl.read_csv(buf, parse_dates=True)
    read_df = read_df.with_columns(
        [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
    )
    assert df.frame_equal(read_df)