Example #1
def test_csv_null_values() -> None:
    csv = textwrap.dedent("""\
        a,b,c
        na,b,c
        a,na,c
        """)
    f = io.StringIO(csv)

    df = pl.read_csv(f, null_values="na")
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = textwrap.dedent("""\
        a,b,c
        na,b,c
        a,n/a,c
        """)
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values=["na", "n/a"])
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = textwrap.dedent("""\
        a,b,c
        na,b,c
        a,n/a,c
        """)
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values={"a": "na", "b": "n/a"})
    assert df[0, "a"] is None
    assert df[1, "b"] is None
Example #2
def test_column_rename_and_dtype_overwrite() -> None:
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        new_columns=["A", "B", "C"],
        dtypes={"A": pl.Utf8, "B": pl.Int64, "C": pl.Float32},
    )
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]

    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        columns=["a", "c"],
        new_columns=["A", "C"],
        dtypes={"A": pl.Utf8, "C": pl.Float32},
    )
    assert df.dtypes == [pl.Utf8, pl.Float32]

    csv = """
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(
        f,
        new_columns=["A", "B", "C"],
        dtypes={"A": pl.Utf8, "C": pl.Float32},
        has_header=False,
    )
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Float32]
Example #3
def test_csv_globbing() -> None:
    path = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "examples",
            "aggregate_multiple_files_in_chunks",
            "datasets",
            "*.csv",
        ))
    df = pl.read_csv(path)
    assert df.shape == (135, 4)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, columns=[0, 1])

    df = pl.read_csv(path, columns=["category", "sugars_g"])
    assert df.shape == (135, 2)
    assert df.row(-1) == ("seafood", 1)
    assert df.row(0) == ("vegetables", 2)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, dtypes=[pl.Utf8, pl.Int64, pl.Int64, pl.Int64])

    dtypes = {
        "category": pl.Utf8,
        "calories": pl.Int32,
        "fats_g": pl.Float32,
        "sugars_g": pl.Int32,
    }

    df = pl.read_csv(path, dtypes=dtypes)
    assert df.dtypes == list(dtypes.values())
Example #4
def test_csv_globbing(examples_dir: str) -> None:
    path = os.path.abspath(os.path.join(
        examples_dir,
        "*.csv",
    ))
    df = pl.read_csv(path)
    assert df.shape == (135, 4)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, columns=[0, 1])

    df = pl.read_csv(path, columns=["category", "sugars_g"])
    assert df.shape == (135, 2)
    assert df.row(-1) == ("seafood", 1)
    assert df.row(0) == ("vegetables", 2)

    with pytest.raises(ValueError):
        _ = pl.read_csv(path, dtypes=[pl.Utf8, pl.Int64, pl.Int64, pl.Int64])

    dtypes = {
        "category": pl.Utf8,
        "calories": pl.Int32,
        "fats_g": pl.Float32,
        "sugars_g": pl.Int32,
    }

    df = pl.read_csv(path, dtypes=dtypes)
    assert df.dtypes == list(dtypes.values())
Example #5
def test_csv_null_values():
    csv = """
a,b,c
na,b,c
a,na,c"""
    f = io.StringIO(csv)

    df = pl.read_csv(f, null_values="na")
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = """
a,b,c
na,b,c
a,n/a,c"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values=["na", "n/a"])
    assert df[0, "a"] is None
    assert df[1, "b"] is None

    csv = """
a,b,c
na,b,c
a,n/a,c"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, null_values={"a": "na", "b": "n/a"})
    assert df[0, "a"] is None
    assert df[1, "b"] is None
Example #6
def test_csv_date_handling() -> None:
    csv = """date
1745-04-02
1742-03-21
1743-06-16
1730-07-22
""
1739-03-16
"""
    expected = pl.DataFrame(
        {
            "date": [
                date(1745, 4, 2),
                date(1742, 3, 21),
                date(1743, 6, 16),
                date(1730, 7, 22),
                None,
                date(1739, 3, 16),
            ]
        }
    )
    out = pl.read_csv(csv.encode(), parse_dates=True)
    assert out.frame_equal(expected, null_equal=True)
    dtypes = {"date": pl.Date}
    out = pl.read_csv(csv.encode(), dtypes=dtypes)
    assert out.frame_equal(expected, null_equal=True)
Example #7
def parse_data_column_info(headers,
                           data_s,
                           sep,
                           columns_number,
                           columns_info=None):
    if columns_info is None:
        col = list(
            map(
                lambda x: utils.parse_column_info(
                    headers, x, MAP_QUANTITY_NUMBER_COLUMN_NAME_BORE),
                range(1, columns_number + 1),
            ))
        return pl.read_csv(
            io.StringIO(data_s),
            sep=sep,
            new_columns=col,
            has_headers=False,
            projection=list(range(0, len(col))),
        )
    else:
        return pl.read_csv(
            io.StringIO(data_s),
            sep=sep,
            new_columns=columns_info,
            has_headers=False,
        )
Example #8
def test_csv_write_escape_newlines() -> None:
    df = pl.DataFrame(dict(escape=["n\nn"]))
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    read_df = pl.read_csv(f)
    assert df.frame_equal(read_df)
Example #9
def test_csv_string_escaping() -> None:
    df = pl.DataFrame({"a": ["Free trip to A,B", '''Special rate "1.79"''']})
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    df_read = pl.read_csv(f)
    assert df_read.frame_equal(df)
Example #10
def test_read_csv_categorical() -> None:
    f = io.BytesIO()
    f.write(
        b"col1,col2,col3,col4,col5,col6\n'foo',2,3,4,5,6\n'bar',8,9,10,11,12")
    f.seek(0)
    df = pl.read_csv(f, has_header=True, dtypes={"col1": pl.Categorical})
    assert df["col1"].dtype == pl.Categorical
Example #11
def test_empty_string_missing_round_trip() -> None:
    df = pl.DataFrame({"varA": ["A", "", None], "varB": ["B", "", None]})
    f = io.BytesIO()
    df.write_csv(f)
    f.seek(0)
    df_read = pl.read_csv(f)
    assert df.frame_equal(df_read)
Example #12
def fallback_chrono_parser() -> None:
    data = """date_1,date_2
    2021-01-01,2021-1-1
    2021-02-02,2021-2-2
    2021-10-10,2021-10-10"""
    assert pl.read_csv(data.encode(),
                       parse_dates=True).null_count().row(0) == (0, 0)
Example #13
    def parse_data(headers, data_s, column_names=None):
        separator = utils.find_separator(headers)

        # Remove multiple whitespaces
        # TODO: find a way for polars to handle columns with variable amounts of whitespace
        if separator == " ":
            new_data = re.sub("[ \t]+", " ", data_s.replace("!", ""))
        else:
            # If we have another separator remove all whitespace around it
            new_data = re.sub(
                f"[\t ]*{re.escape(separator)}[\t ]*",
                separator,
                data_s.replace(separator + "!", "").replace("!", ""),
            )

        # Remove whitespace at the beginning and end of each line, and drop
        # any trailing blank lines
        new_data = "\n".join([line.strip()
                              for line in new_data.splitlines()]).rstrip()

        return pl.read_csv(
            new_data.encode(),
            sep=separator,
            new_columns=column_names,
            has_headers=False,
        )
Example #14
def get_genotype_iter_vars_file(imputation_run_name, vars_fname, samples):
    with open(vars_fname) as vars_file:
        next(vars_file)
        try:
            next(vars_file)
        except StopIteration:
            # this is an empty vars file
            # yield only the list of details fields and then exit without yielding variants
            itr = load_and_filter_genotypes.load_strs(imputation_run_name,
                                                      f'1:1-1', samples)
            yield next(itr)
            return
    # read from the file path; the handle opened above is already closed here
    f = pl.read_csv(vars_fname, sep='\t')
    chroms = f['chrom']
    poses = f['pos']
    first = True
    for (chrom, pos) in zip(chroms, poses):
        itr = load_and_filter_genotypes.load_strs(imputation_run_name,
                                                  f'{chrom}:{pos}-{pos}',
                                                  samples)
        # yield or skip the extra details line
        if first:
            yield next(itr)
            first = False
        else:
            next(itr)
        # yield the genotype
        yield next(itr)
Example #15
def test_to_polars_dataframe(report):
    # This relies entirely on Arrow, so as long as those tests work,
    # these should too.
    df = report.to_polars()
    df_csv = pl.read_csv(MOCK_CSV_PATH)
    df_csv = df_csv.with_column(
        pl.col("day").str.strptime(pl.Date).cast(pl.Datetime))
    assert df.frame_equal(df_csv)
Example #16
def test_compressed_csv() -> None:
    # gzip compression
    csv = """
a,b,c
1,a,1.0
2,b,2.0,
3,c,3.0
"""
    fout = io.BytesIO()
    with gzip.GzipFile(fileobj=fout, mode="w") as f:
        f.write(csv.encode())

    csv_bytes = fout.getvalue()
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame({
        "a": [1, 2, 3],
        "b": ["a", "b", "c"],
        "c": [1.0, 2.0, 3.0]
    })
    assert out.frame_equal(expected)

    # now from disk
    csv_file = Path(__file__).parent.parent / "files" / "gzipped.csv"
    out = pl.read_csv(str(csv_file))
    assert out.frame_equal(expected)

    # now with column projection
    out = pl.read_csv(csv_bytes, columns=["a", "b"])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert out.frame_equal(expected)

    # zlib compression
    csv_bytes = zlib.compress(csv.encode())
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame({
        "a": [1, 2, 3],
        "b": ["a", "b", "c"],
        "c": [1.0, 2.0, 3.0]
    })
    assert out.frame_equal(expected)

    # no compression
    f2 = io.BytesIO(b"a,b\n1,2\n")
    out2 = pl.read_csv(f2)
    expected = pl.DataFrame({"a": [1], "b": [2]})
    assert out2.frame_equal(expected)
Example #17
def build_tissue_synonym_df(tissue_file, output_dir):
    # Get metadata file and tissue_df (assume that tissue_df is also in output_dir)
    tissue_metadata = pl.read_csv(tissue_file)  # will read NA as string!
    tissue_df = pl.from_arrow(
        fread(os.path.join(output_dir, "tissue.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to tissueid
    tissue_cols = [
        col for col in tissue_metadata.columns
        if re.match(".*tissueid$", col) and col != "unique.tissueid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the tissueid columns to only valid datasets
    tissue_columns = [
        name for name in tissue_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with cell_df
    tissue_meta_long = tissue_metadata \
        .melt(id_vars="unique.tissueid", value_vars=tissue_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "tissue_name", "variable": "dataset_id"})

    tissue_synonym_df = tissue_df \
        .join(tissue_meta_long, left_on="name", right_on="unique.tissueid", how="left") \
        .drop("name") \
        .rename({"id": "tissue_id"}) \
        .filter(col("tissue_name") != "") \
        .drop_duplicates() \
        .drop_nulls()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    }
    # Regex the dataset identifiers to match the dataset map
    tissue_synonym_df["dataset_id"] = tissue_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.cellid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC$", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    tissue_synonym_df = tissue_synonym_df.drop_duplicates()
    tissue_synonym_df["id"] = range(1, tissue_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for fast write to disk
    tissue_synonym_dt = dt.Frame(tissue_synonym_df.to_arrow())
    tissue_synonym_dt.to_jay(os.path.join(output_dir, "tissue_synonym.jay"))
Example #18
def build_compound_synonym_df(compound_file, output_dir):
    # Get metadata file and compound_df
    compound_metadata = pl.read_csv(compound_file, null_values="NA")
    compound_df = pl.from_arrow(
        fread(os.path.join(output_dir, "compound.jay")).to_arrow())
    dataset_df = pl.from_arrow(
        fread(os.path.join(output_dir, "dataset.jay")).to_arrow())

    # Find all columns relevant to drugid
    compound_cols = [
        col for col in compound_metadata.columns
        if re.match(".*drugid$", col) and col != "unique.drugid"
    ]

    # Read in which datasets we are working with
    dataset_names = os.listdir("procdata")
    clean_dataset_names = [re.sub("_.*$", "", name) for name in dataset_names]
    dataset_regex = re.compile("|".join(clean_dataset_names))

    # Filter the drugid columns to only valid datasets
    compound_columns = [
        name for name in compound_cols if re.match(dataset_regex, name)
    ]

    # Get all unique synonyms and join with cell_df
    compound_meta_long = compound_metadata \
        .melt(id_vars="unique.drugid", value_vars=compound_columns) \
        .drop_nulls() \
        .drop_duplicates() \
        .rename({"value": "compound_name", "variable": "dataset_id"}) \
        .filter(col("compound_name") != "")

    compound_synonym_df = compound_df \
        .join(compound_meta_long, left_on="name", right_on="unique.drugid", how="left") \
        .rename({"id": "compound_id"}) \
        .select(["compound_id", "dataset_id", "compound_name"]) \
        .drop_nulls() \
        .drop_duplicates()

    # Create a map from dataset name to dataset id
    dataset_map = {
        dct["name"]: str(dct["id"])
        for dct in dataset_df.to_pandas().to_dict(orient="records")
    }
    }
    # Regex the dataset identifiers to match the dataset map
    compound_synonym_df["dataset_id"] = compound_synonym_df["dataset_id"] \
        .apply(lambda x: re.sub(r"\.drugid$|[_.].*$", "", x)) \
        .apply(lambda x: re.sub("GDSC2019", "GDSC_v2", x)) \
        .apply(lambda x: re.sub("GDSC1.*$", "GDSC_v1", x)) \
        .apply(lambda x: dataset_map[x]) \
        .cast(pl.Int64)

    compound_synonym_df = compound_synonym_df.drop_duplicates()
    compound_synonym_df["id"] = range(1, compound_synonym_df.shape[0] + 1)

    # Convert to datatable.Frame for memory mapped output file
    df = dt.Frame(compound_synonym_df.to_arrow())
    df.to_jay(os.path.join(output_dir, "compound_synonym.jay"))
Example #19
def test_compressed_csv():
    # gzip compression
    csv = """
a,b,c
1,a,1.0
2,b,2.0,
3,c,3.0
"""
    fout = io.BytesIO()
    with gzip.GzipFile(fileobj=fout, mode="w") as f:
        f.write(csv.encode())

    csv_bytes = fout.getvalue()
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame({
        "a": [1, 2, 3],
        "b": ["a", "b", "c"],
        "c": [1.0, 2.0, 3.0]
    })
    assert out.frame_equal(expected)

    # now from disk
    out = pl.read_csv("tests/files/gzipped.csv")
    assert out.frame_equal(expected)

    # now with column projection
    out = pl.read_csv(csv_bytes, columns=["a", "b"])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert out.frame_equal(expected)

    # zlib compression
    csv_bytes = zlib.compress(csv.encode())
    out = pl.read_csv(csv_bytes)
    expected = pl.DataFrame({
        "a": [1, 2, 3],
        "b": ["a", "b", "c"],
        "c": [1.0, 2.0, 3.0]
    })
    assert out.frame_equal(expected)

    # no compression
    f = io.BytesIO(b"a, b\n1,2\n")
    out = pl.read_csv(f)
    expected = pl.DataFrame({"a": [1], "b": [2]})
    assert out.frame_equal(expected)
Example #20
def test_dtype_overwrite_with_column_name_selection() -> None:
    csv = textwrap.dedent("""\
        a,b,c,d
        1,2,3,4
        1,2,3,4
        """)
    f = io.StringIO(csv)
    df = pl.read_csv(f, columns=["c", "b", "d"], dtypes=[pl.Int32, pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int32, pl.Int64]
Example #21
def test_partial_dtype_overwrite():
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    df = pl.read_csv(f, dtype=[pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]
Example #22
def test_partial_dtype_overwrite() -> None:
    csv = textwrap.dedent("""\
        a,b,c
        1,2,3
        1,2,3
        """)
    f = io.StringIO(csv)
    df = pl.read_csv(f, dtypes=[pl.Utf8])
    assert df.dtypes == [pl.Utf8, pl.Int64, pl.Int64]
Example #23
def test_different_eol_char() -> None:
    csv = "a,1,10;b,2,20;c,3,30"
    expected = pl.DataFrame({
        "column_1": ["a", "b", "c"],
        "column_2": [1, 2, 3],
        "column_3": [10, 20, 30]
    })
    assert pl.read_csv(csv.encode(), eol_char=";",
                       has_header=False).frame_equal(expected)
Example #24
def test_read_csv_buffer_ownership() -> None:
    buf = io.BytesIO(b"\xf0\x9f\x98\x80,5.55,333\n\xf0\x9f\x98\x86,-5.0,666")
    df = pl.read_csv(
        buf,
        has_header=False,
        new_columns=["emoji", "flt", "int"],
    )
    # confirm that read_csv succeeded, and didn't close the input buffer (#2696)
    assert df.shape == (2, 3)
    assert not buf.closed
Example #25
def test_read_csv_columns_argument(col_input: Union[List[int], List[str]],
                                   col_out: List[str]) -> None:
    csv = """a,b,c
    1,2,3
    1,2,3
    """
    f = io.StringIO(csv)
    df = pl.read_csv(f, columns=col_input)
    assert df.shape[0] == 2
    assert df.columns == col_out
Example #26
def test_partial_decompression(foods_csv: str) -> None:
    fout = io.BytesIO()
    with open(foods_csv, "rb") as fread:
        with gzip.GzipFile(fileobj=fout, mode="w") as f:
            f.write(fread.read())

    csv_bytes = fout.getvalue()
    for n_rows in [1, 5, 26]:
        out = pl.read_csv(csv_bytes, n_rows=n_rows)
        assert out.shape == (n_rows, 4)
Example #27
def test_invalid_utf8() -> None:
    np.random.seed(1)
    bts = bytes(np.random.randint(0, 255, 200))
    file = path.join(path.dirname(__file__), "nonutf8.csv")

    with open(file, "wb") as f:
        f.write(bts)

    a = pl.read_csv(file, has_headers=False, encoding="utf8-lossy")
    b = pl.scan_csv(file, has_headers=False, encoding="utf8-lossy").collect()
    assert a.frame_equal(b, null_equal=True)
Example #28
def test_partial_column_rename():
    csv = """
a,b,c
1,2,3
1,2,3
"""
    f = io.StringIO(csv)
    for use in [True, False]:
        f.seek(0)
        df = pl.read_csv(f, new_columns=["foo"], use_pyarrow=use)
        assert df.columns == ["foo", "b", "c"]
Example #29
def test_csv_whitespace_delimiter_at_end_do_not_skip() -> None:
    csv = "0\t1\t\t\t\t"
    assert pl.read_csv(csv.encode(), sep="\t",
                       has_header=False).to_dict(False) == {
                           "column_1": [0],
                           "column_2": [1],
                           "column_3": [None],
                           "column_4": [None],
                           "column_5": [None],
                           "column_6": [None],
                       }
Example #30
def test_to_from_buffer(df: pl.DataFrame) -> None:
    buf = io.BytesIO()
    df.write_csv(buf)
    buf.seek(0)

    read_df = pl.read_csv(buf, parse_dates=True)

    read_df = read_df.with_columns(
        [pl.col("cat").cast(pl.Categorical), pl.col("time").cast(pl.Time)]
    )
    assert df.frame_equal(read_df)