Пример #1
0
def test_iread_error():
    """
    Exercise every `errors=` mode of iread() on a list of text sources.

    One of the three sources is malformed (the second one has a data row
    with more fields than its header), so the number of objects produced
    depends on the chosen mode.
    """
    inputs = ["A\n1", "A,B\n1,2\n3,4,5\n", "D\n"]

    # The default mode and the explicit "warn" mode must behave the same:
    # a warning is emitted and only two frames are produced.
    for options in ({}, {"errors": "warn"}):
        with pytest.warns(dt.exceptions.IOWarning):
            frames = list(dt.iread(inputs, **options))
            assert len(frames) == 2

    # "raise" turns the problem into an exception
    with pytest.raises(dt.exceptions.IOError):
        list(dt.iread(inputs, errors="raise"))

    # "ignore" produces two results; "store" produces three (the error
    # object is kept among the results)
    for mode, expected_count in (("ignore", 2), ("store", 3)):
        results = list(dt.iread(inputs, errors=mode))
        assert len(results) == expected_count
Пример #2
0
def test_iread_simple():
    """
    Read three single-column text sources through iread().

    The k-th source (counting from 1) is expected to yield a frame with
    k rows holding the values k, k+1, ..., 2k-1 in column "A".
    """
    texts = ["A\n1", "A\n2\n3\n", "A\n3\n4\n5"]
    for nrows, frame in enumerate(dt.iread(texts), start=1):
        expected_values = list(range(nrows, 2 * nrows))
        assert isinstance(frame, dt.Frame)
        assert frame.source == "<text>"
        assert frame.names == ("A",)
        assert frame.shape == (nrows, 1)
        assert frame.to_list() == [expected_values]
Пример #3
0
def test_diabetes_tiny_two_sheets_xlsx():
    """
    Read a two-sheet Excel workbook with iread() and compare the sheets.

    Each sheet must come back as its own frame whose `.source` names the
    sheet and its cell range, and both frames must have identical shape,
    stypes and data.
    """
    path = find_file("h2o-3", "fread", "diabetes_tiny_two_sheets.xlsx")
    expected_sources = [
        path + suffix for suffix in ("/Sheet1/A1:AY17", "/Sheet2/A1:AY17")
    ]

    sheets = list(dt.iread(path))
    first, second = sheets

    # Sheets may arrive in any order, hence the sort
    assert sorted(sheet.source for sheet in (first, second)) == expected_sources
    assert first.shape == (16, 51)
    assert second.shape == (16, 51)
    assert first.stypes == second.stypes
    assert first.to_list() == second.to_list()
Пример #4
0
def test_fread_zip_file_bad1(tempfile):
    """
    Reading an empty zip archive must not fail.

    fread() is expected to return an empty Frame and iread() to yield
    nothing. The archive is created next to the `tempfile` fixture path
    and is always removed afterwards, even if an assertion fails.
    """
    import zipfile
    zfname = tempfile + ".zip"
    # Mode "x" refuses to overwrite an existing file, so the archive is
    # created outside the try block: if creation fails we never delete a
    # file this test did not create.
    with zipfile.ZipFile(zfname, "x"):
        pass
    try:
        DT = dt.fread(zfname)
        assert_equals(DT, dt.Frame())
        DTs = list(dt.iread(zfname))
        assert len(DTs) == 0
    finally:
        os.unlink(zfname)
Пример #5
0
def test_iread_tar_gz(tempfile):
    """
    Read every member of a .tar.gz archive through iread().

    The archive holds three members ("one", "two", "three"), each a
    single column of three consecutive integers (1-3, 4-6, 7-9). The
    `tempfile` fixture path is reused as scratch space for the member
    contents; the archive itself is removed when the test finishes.
    """
    import tarfile
    outfile = tempfile + ".tar.gz"
    members = ["one", "two", "three"]
    try:
        with tarfile.open(outfile, "w:gz") as tf:
            for k, member in enumerate(members):
                with open(tempfile, 'w') as out:
                    out.write("".join("%d\n" % v
                                      for v in range(3*k + 1, 3*k + 4)))
                tf.add(tempfile, arcname=member)
        DTs = list(dt.iread(outfile))
        # Without this check the loop below would pass vacuously if
        # iread() produced no frames at all.
        assert len(DTs) == len(members)
        for i, DT in enumerate(DTs):
            assert DT.source == os.path.join(outfile, members[i])
            assert DT.shape == (3, 1)
            assert DT.to_list()[0] == list(range(3*i + 1, 3*i + 4))
    finally:
        # The archive is not covered by the `tempfile` fixture, so clean
        # it up here.
        if os.path.exists(outfile):
            os.remove(outfile)
Пример #6
0
def test_cannot_create_from_multiple_files(tempfile):
    """
    A glob matching several files works with iread() but not with Frame().

    Three CSV files are created from the `tempfile` fixture path; iread()
    must yield one frame per file, whereas constructing a single Frame
    from the same pattern must raise an IOError.
    """
    file1 = tempfile + ".1.csv"
    file2 = tempfile + ".2.csv"
    file3 = tempfile + ".3.csv"
    try:
        with open(file1, "w") as o1, open(file2, "w") as o2, \
                open(file3, "w") as o3:
            o1.write("A,B\nfoo,2\n")
            o2.write("3\n4\n5\n6\n")
            o3.write("qw\n1\n2\n5\n")
        ff = dt.iread(tempfile + ".*.csv")
        assert len(list(ff)) == 3
        msg = r"fread\(\) input contains multiple sources"
        with pytest.raises(IOError, match=msg):
            dt.Frame(tempfile + ".*.csv")
    finally:
        # Remove all three files (file3 used to be left behind). The
        # existence check keeps a failure while creating the files from
        # being masked by a FileNotFoundError raised during cleanup.
        for path in (file1, file2, file3):
            if os.path.exists(path):
                os.remove(path)
Пример #7
0
def test_fread_from_glob(tempfile):
    """
    Read ten CSV files through a single glob pattern passed to iread().

    Ten files named <base>0<ext> ... <base>9<ext> are written, each with
    header A,B,C, a row of zeros and one row derived from the file index.
    The test checks the iterator type, the per-file frames, and the
    result of row-binding all frames together.
    """
    base, ext = os.path.splitext(tempfile)
    ext = ext or ".csv"
    pattern = base + "*" + ext
    paths = [base + str(n) + ext for n in range(10)]
    try:
        for j, path in enumerate(paths):
            with open(path, "w") as out:
                out.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                          % (j, j * 2 + 1, (j + 3) * 17 % 23))

        reader = dt.iread(pattern)
        assert type(reader).__name__ == "read_iterator"
        frames = list(reader)
        assert len(frames) == 10
        assert {frame.source for frame in frames} == set(paths)

        # Glob expansion does not guarantee any particular order, so put
        # the frames back into file order 0..9 before comparing data.
        frames.sort(key=lambda frame: frame.source)
        for frame in frames:
            assert isinstance(frame, dt.Frame)
            frame_integrity_check(frame)
            assert frame.names == ("A", "B", "C")
            assert frame.shape == (2, 3)

        combined = dt.rbind(frames)
        frame_integrity_check(combined)
        assert combined.names == ("A", "B", "C")
        assert combined.shape == (20, 3)
        expected_columns = [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
        assert combined.to_list() == expected_columns
    finally:
        for path in paths:
            os.remove(path)
Пример #8
0
def load_table(name, data_dir):
    """
    Load all PSet tables with name into a datatable, dropping any duplicate rows.

    Only files laid out as '{data_dir}/{pset}/{pset}_{name}.csv' are read,
    where {pset} consists of word characters only.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    # Get all candidate files. glob is called without recursive=True, so
    # '**' behaves like '*' and matches exactly one directory level.
    candidates = glob.glob(os.path.join(data_dir, '**', f'*{name}.csv'))

    # Keep only '{data_dir}/{pset}/{pset}_{name}.csv'. The check is done on
    # path components rather than by splicing data_dir and name into a
    # regular expression: that way regex metacharacters in either value,
    # a trailing separator on data_dir, and non-'/' path separators cannot
    # break the filter.
    suffix = f'_{name}.csv'
    files = []
    for file_name in candidates:
        directory, base_name = os.path.split(file_name)
        pset = os.path.basename(directory)
        if re.fullmatch(r'\w+', pset) and base_name == pset + suffix:
            files.append(file_name)

    # Read and concatenate tables
    df = rbind(*iread(files, sep=','))
    # Replace any empty strings with None/NA (the return value is not used;
    # Frame.replace is documented to modify the frame in place)
    df.replace("", None)
    # Drop duplicates
    # (groups by all columns and selects only the first row from each group)
    df = df[0, :, by(df.names)]

    return df
Пример #9
0
def test_issue2621_b():
    """
    Row-bind two frames produced by iread() from the same text source.

    The source has spaces after the separators and an indented data
    line; the combined frame must hold the single data row twice under
    the column names c1, c2 and c3.
    """
    text = """c1, c2, c3
             11, 2, 3"""
    expected = dt.Frame(c1=[11, 11], c2=[2, 2], c3=[3, 3])
    combined = dt.rbind(dt.iread([text] * 2))
    assert_equals(combined, expected)
}
# pset_tables: ["dose_response", "drug", "datasets_cells",
#     "dataset_statistics", "cell", "drug_annotation", "gene_drug",
#     "profile", "dataset", "mol_cell", "gene_annotation", "dataset_cell",
#     "experiment", "tissue", "gene"]

# Pick the PSet to work with: the entry at index 3 of `psets`.
pset_name = psets[3]  # GDSC_v1

# -- Read in a single .csv
# The table name is the third-from-last entry of this PSet's table list
# (presumably "experiment", going by the pset_tables listing -- confirm).
# NOTE(review): the path passed to fread is a glob pattern; this relies on
# the pattern matching exactly one file -- confirm against the data layout.
experiment = fread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][-3],
                 f'*{pset_tables[pset_name][-3]}*.csv'))

# -- Read in multiple .csv files and make a single Frame
# iread yields one Frame per matched file; rbind stacks them row-wise.
# The table name is the first entry of this PSet's table list.
dose_response = rbind(*iread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][0],
                 f'*{pset_tables[pset_name][0]}*.csv')))

# Can use pattern matching to read in multiple files; ** will match any number of subdirectories
# Should make path parsing code much more compact
# NOTE(review): verify that iread expands '**' recursively; Python's own
# glob.glob only does so when called with recursive=True.
all_cell_tables = rbind(
    *iread(os.path.join(data_dir, '**', 'cell', '*cell.csv')))

# -- Write to csv
# The output file is named after the same table that was read into
# dose_response (first entry of the PSet's table list).
dose_response.to_csv(
    os.path.join(output_dir, f'{pset_tables[pset_name][0]}.csv'))

# -- Select (of the form df[filter, select, ...])
# f is for Frame and references variables within the Frame object (i.e., columns)
# The result of this expression is not assigned, so it is only visible
# when run interactively (e.g. in a REPL or notebook).
dose_response[:, [f.id, f.experiment_id]]