def test_iread_error():
    """Exercise every iread() error-handling mode on a bad source list."""
    # Middle source has an inconsistent row width; last one has no data rows.
    bad_sources = ["A\n1", "A,B\n1,2\n3,4,5\n", "D\n"]
    # Default mode behaves like errors="warn": bad source skipped, IOWarning raised.
    with pytest.warns(dt.exceptions.IOWarning):
        frames = list(dt.iread(bad_sources))
    assert len(frames) == 2
    with pytest.warns(dt.exceptions.IOWarning):
        frames = list(dt.iread(bad_sources, errors="warn"))
    assert len(frames) == 2
    # errors="raise" aborts iteration with an IOError.
    with pytest.raises(dt.exceptions.IOError):
        frames = list(dt.iread(bad_sources, errors="raise"))
    # errors="ignore" drops the failing source with no warning at all.
    frames = list(dt.iread(bad_sources, errors="ignore"))
    assert len(frames) == 2
    # errors="store" yields the error object in place of a Frame.
    frames = list(dt.iread(bad_sources, errors="store"))
    assert len(frames) == 3
def test_iread_simple():
    """iread() over a list of text sources yields one Frame per source."""
    sources = ["A\n1", "A\n2\n3\n", "A\n3\n4\n5"]
    for idx, frame in enumerate(dt.iread(sources)):
        nrows = idx + 1
        assert isinstance(frame, dt.Frame)
        assert frame.source == "<text>"
        assert frame.names == ("A",)
        assert frame.shape == (nrows, 1)
        # Source k holds the consecutive integers k+1 .. 2k (single column).
        assert frame.to_list() == [list(range(nrows, 2 * nrows))]
def test_diabetes_tiny_two_sheets_xlsx():
    """An .xlsx file with two sheets yields two Frames with matching data."""
    filename = find_file("h2o-3", "fread", "diabetes_tiny_two_sheets.xlsx")
    # iread() labels each frame's source as "<file>/<sheet>/<cell range>".
    expected_sources = [filename + "/Sheet1/A1:AY17",
                        filename + "/Sheet2/A1:AY17"]
    frame1, frame2 = list(dt.iread(filename))
    assert sorted([frame1.source, frame2.source]) == expected_sources
    assert frame1.shape == frame2.shape == (16, 51)
    assert frame1.stypes == frame2.stypes
    assert frame1.to_list() == frame2.to_list()
def test_fread_zip_file_bad1(tempfile):
    """
    A zip archive with no members reads as an empty Frame via fread(),
    and yields zero frames via iread().

    Fix: the cleanup `os.unlink(zfname)` previously ran only after all
    assertions passed, so a failing test leaked the temporary .zip file.
    It now runs in a `finally` block.
    """
    import zipfile
    zfname = tempfile + ".zip"
    # Mode "x" creates the archive and fails if it already exists.
    with zipfile.ZipFile(zfname, "x"):
        pass
    try:
        DT = dt.fread(zfname)
        assert_equals(DT, dt.Frame())
        DTs = list(dt.iread(zfname))
        assert len(DTs) == 0
    finally:
        # Always remove the archive, even when an assertion above fails.
        os.unlink(zfname)
def test_iread_tar_gz(tempfile):
    """iread() iterates over each member of a .tar.gz archive in order."""
    import tarfile
    archive = tempfile + ".tar.gz"
    members = [("one", "1\n2\n3\n"),
               ("two", "4\n5\n6\n"),
               ("three", "7\n8\n9\n")]
    # Reuse the same scratch file for every member; only the arcname differs.
    with tarfile.open(archive, "w:gz") as tf:
        for arcname, text in members:
            with open(tempfile, 'w') as out:
                out.write(text)
            tf.add(tempfile, arcname=arcname)
    for i, frame in enumerate(dt.iread(archive)):
        # Each frame's source is "<archive>/<member name>".
        assert frame.source == os.path.join(archive, members[i][0])
        assert frame.shape == (3, 1)
        assert frame.to_list()[0] == list(range(3 * i + 1, 3 * i + 4))
def test_cannot_create_from_multiple_files(tempfile):
    """
    dt.Frame() must refuse a glob pattern matching several files, while
    dt.iread() happily iterates over all of them.

    Fix: the `finally` clause previously removed only file1 and file2,
    leaking file3 after every run; it now removes all three.
    """
    file1 = tempfile + ".1.csv"
    file2 = tempfile + ".2.csv"
    file3 = tempfile + ".3.csv"
    try:
        with open(file1, "w") as o1, open(file2, "w") as o2, \
                open(file3, "w") as o3:
            o1.write("A,B\nfoo,2\n")
            o2.write("3\n4\n5\n6\n")
            o3.write("qw\n1\n2\n5\n")
        ff = dt.iread(tempfile + ".*.csv")
        assert len(list(ff)) == 3
        msg = r"fread\(\) input contains multiple sources"
        with pytest.raises(IOError, match=msg):
            dt.Frame(tempfile + ".*.csv")
    finally:
        # Remove every temp file created above (file3 was previously leaked).
        os.remove(file1)
        os.remove(file2)
        os.remove(file3)
def test_fread_from_glob(tempfile):
    """iread() on a glob pattern returns one Frame per matching file."""
    base, ext = os.path.splitext(tempfile)
    ext = ext or ".csv"
    pattern = base + "*" + ext
    tempfiles = ["".join([base, str(i), ext]) for i in range(10)]
    try:
        for j in range(10):
            with open(tempfiles[j], "w") as f:
                f.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                        % (j, j * 2 + 1, (j + 3) * 17 % 23))
        reader = dt.iread(pattern)
        assert reader.__class__.__name__ == "read_iterator"
        frames = list(reader)
        assert len(frames) == 10
        assert {frame.source for frame in frames} == set(tempfiles)
        # The glob pattern tempfile*.csv may have returned the files in a
        # shuffled order, need to sort them back from 0 to 9:
        frames.sort(key=lambda frame: frame.source)
        for frame in frames:
            assert isinstance(frame, dt.Frame)
            frame_integrity_check(frame)
            assert frame.names == ("A", "B", "C")
            assert frame.shape == (2, 3)
        df = dt.rbind(frames)
        frame_integrity_check(df)
        assert df.names == ("A", "B", "C")
        assert df.shape == (20, 3)
        assert df.to_list() == [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
    finally:
        for f in tempfiles:
            os.remove(f)
def load_table(name, data_dir):
    """
    Load all PSet tables with name into a datatable, dropping any duplicate rows.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    # Get all files matching *{name}.csv anywhere under data_dir
    files = glob.glob(os.path.join(data_dir, '**', f'*{name}.csv'))
    # Keep only paths of the form '{data_dir}/{pset}/{pset}_{name}.csv'.
    # Fix: re.escape() guards against regex metacharacters in data_dir or
    # name, and the '.' in '.csv' is now escaped — previously 'Xcsv' or any
    # character in place of the dot would also match.
    path_re = re.compile(
        re.escape(data_dir) + r'/(\w+)/\1_' + re.escape(name) + r'\.csv$')
    files = [
        file_name for file_name in files
        if path_re.search(file_name)
    ]
    # Read and concatenate tables
    df = rbind(*iread(files, sep=','))
    # Replace any empty strings with None/NA
    df.replace("", None)
    # Drop duplicates
    # (groups by all columns and selects only the first row from each group)
    df = df[0, :, by(df.names)]
    return df
def test_issue2621_b():
    """Regression test for issue #2621: rbind over iread of two identical
    text sources produces a single Frame with doubled rows."""
    src = "c1, c2, c3\n11, 2, 3"
    result = dt.rbind(dt.iread([src, src]))
    expected = dt.Frame(c1=[11, 11], c2=[2, 2], c3=[3, 3])
    assert_equals(result, expected)
}   # closes the pset_tables dict literal started above this view
# pset_tables: ["dose_response", "drug", "datasets_cells",
#   "dataset_statistics", "cell", "drug_annotation", "gene_drug",
#   "profile", "dataset", "mol_cell", "gene_annotation", "dataset_cell",
#   "experiment", "tissue", "gene"]
# NOTE(review): pset_tables is indexed by PSet name below, so it appears to
# map each PSet name to its list of table names — confirm against the dict
# definition above.

pset_name = psets[3]  # GDSC_v1

# -- Read in a single .csv
# fread() accepts a glob pattern; [-3] picks one table of this PSet.
experiment = fread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][-3],
                 f'*{pset_tables[pset_name][-3]}*.csv'))

# -- Read in multiple .csv files and make a single Frame
# iread() yields one Frame per matched file; rbind(*...) stacks them.
dose_response = rbind(*iread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][0],
                 f'*{pset_tables[pset_name][0]}*.csv')))

# Can use pattern matching to read in multiple files; ** will match any
# number of subdirectories. Should make path parsing code much more compact.
all_cell_tables = rbind(
    *iread(os.path.join(data_dir, '**', 'cell', '*cell.csv')))

# -- Write to csv
dose_response.to_csv(
    os.path.join(output_dir, f'{pset_tables[pset_name][0]}.csv'))

# -- Select (of the form df[filter, select, ...])
# f is for Frame and references variables within the Frame object (i.e., columns)
dose_response[:, [f.id, f.experiment_id]]