def test_read_write_csv_dialect(tmpdir):
    """Rows written with the 'excel' dialect must read back equal with dialect='infer'."""
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    target = str(tmpdir.join('test_read_write_csv.csv'))
    io.write_csv(rows, target, dialect='excel', make_dirs=True)
    # Materialize the reader output so it can be compared against the list of rows.
    round_tripped = [row for row in io.read_csv(target, dialect='infer')]
    assert round_tripped == rows
def test_read_write_dict(self, tmpdir):
    """Dict rows written with explicit fieldnames must read back as equal dicts."""
    columns = ("text", "kind", "number")
    records = [
        {"text": "this is some text", "kind": "scandal", "number": 42.0},
        {"text": "here's some more text: boom!", "kind": "escándalo", "number": 1.0},
    ]
    target = str(tmpdir.join("test_read_write_csv_dict.csv"))
    # A fresh list of field names is handed to each call, as two separate literals would be.
    io.write_csv(
        records,
        target,
        dialect="excel",
        make_dirs=True,
        fieldnames=list(columns),
    )
    reader = io.read_csv(target, dialect="excel", fieldnames=list(columns))
    # Convert each row to a plain dict before comparing.
    round_tripped = list(map(dict, reader))
    assert round_tripped == records
def load(self, exclude: Optional[Set[str]] = None) -> Dict[str, str]:
    """
    Read the tab-delimited file at ``self.data_dir / self.filename`` and map
    each row's "Id" value to its "Part1" value.

    Args:
        exclude: "Part1" values to leave out of the mapping. If None,
            no rows are excluded on this basis.

    Returns:
        Dict[str, str]: Mapping of "Id" to "Part1", built only from rows
        whose "Part1" value is truthy and not in ``exclude``.
    """
    columns = [
        "Id",
        "Part2B",
        "Part2T",
        "Part1",
        "Scope",
        "Language_Type",
        "Ref_Name",
        "Comment",
    ]
    # NOTE(review): quoting=1 presumably corresponds to csv.QUOTE_ALL — confirm
    # against tio.read_csv.
    records = tio.read_csv(
        self.data_dir.joinpath(self.filename),
        delimiter="\t",
        fieldnames=columns,
        quoting=1,
    )
    lang_map = {}
    for record in records:
        # Skip rows with a missing or empty "Part1" value.
        if not record.get("Part1"):
            continue
        # Skip rows whose "Part1" value was explicitly excluded by the caller.
        if exclude is not None and record["Part1"] in exclude:
            continue
        lang_map[record["Id"]] = record["Part1"]
    # Log only a small sorted sample of the mapping.
    sample = sorted(lang_map.items())[:5]
    LOGGER.info("loaded IsoLangResource data:\n%s ...", sample)
    return lang_map
def load(
    self,
    iso_lang_map: Dict[str, str],
    min_len: int = 25,
) -> List[Tuple[str, str]]:
    """
    Read "sentences.csv" from ``self.data_dir`` and build (text, lang) examples.

    Args:
        iso_lang_map: Mapping applied to each row's "iso-639-3" value to get
            the language label; rows whose value is not a key are skipped.
        min_len: Minimum number of alphanumeric *chars* a row's text must
            contain for the example to be included.

    Returns:
        Sequence of (text, lang) examples.
    """
    # NOTE(review): quoting=1 presumably corresponds to csv.QUOTE_ALL — confirm
    # against tio.read_csv.
    records = tio.read_csv(
        self.data_dir.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    data = []
    for record in records:
        # Only keep rows whose language code can be mapped.
        if record["iso-639-3"] not in iso_lang_map:
            continue
        # Length is measured in alphanumeric characters, not raw characters.
        n_alnum = itertoolz.count(char for char in record["text"] if char.isalnum())
        if n_alnum < min_len:
            continue
        data.append((record["text"], iso_lang_map[record["iso-639-3"]]))
    LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
    return data
def test_read_write_csv_delimiters(tmpdir):
    """Rows must round-trip for each of several delimiters passed to write and read."""
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    separators = (',', '\t', '|', ':')
    for sep in separators:
        # The same file path is reused for every delimiter.
        target = str(tmpdir.join('test_read_write_csv.csv'))
        io.write_csv(rows, target, delimiter=sep, make_dirs=True)
        round_tripped = [row for row in io.read_csv(target, delimiter=sep)]
        assert round_tripped == rows
def test_read_write_dialect(self, tmpdir):
    """Rows written with the "excel" dialect must read back equal with dialect="infer"."""
    rows = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    target = str(tmpdir.join("test_read_write_csv.csv"))
    io.write_csv(rows, target, dialect="excel", make_dirs=True)
    # Materialize the reader output so it can be compared against the list of rows.
    round_tripped = [row for row in io.read_csv(target, dialect="infer")]
    assert round_tripped == rows
def test_read_write_compressed(self, tmpdir):
    """Rows must round-trip for plain and compressed (.gz, .bz2, .xz) file extensions."""
    rows = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    extensions = (".csv", ".csv.gz", ".csv.bz2", ".csv.xz")
    for extension in extensions:
        target = str(tmpdir.join("test_read_write_csv" + extension))
        io.write_csv(rows, target, make_dirs=True)
        round_tripped = [row for row in io.read_csv(target)]
        assert round_tripped == rows
def test_read_write_delimiters(self, tmpdir):
    """Rows must round-trip for each of several delimiters passed to write and read."""
    rows = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    separators = (",", "\t", "|", ":")
    for sep in separators:
        # The same file path is reused for every delimiter.
        target = str(tmpdir.join("test_read_write_csv.csv"))
        io.write_csv(rows, target, delimiter=sep, make_dirs=True)
        round_tripped = [row for row in io.read_csv(target, delimiter=sep)]
        assert round_tripped == rows
def test_read_write_csv_dict(tmpdir):
    """Dict rows written with explicit fieldnames must read back as equal dicts."""
    columns = ('text', 'kind', 'number')
    records = [
        {'text': 'this is some text', 'kind': 'scandal', 'number': 42.0},
        {'text': "here's some more text: boom!", 'kind': 'escándalo', 'number': 1.0},
    ]
    target = str(tmpdir.join('test_read_write_csv_dict.csv'))
    # A fresh list of field names is handed to each call, as two separate literals would be.
    io.write_csv(
        records,
        target,
        dialect='excel',
        make_dirs=True,
        fieldnames=list(columns),
    )
    reader = io.read_csv(target, dialect='excel', fieldnames=list(columns))
    # Convert each row to a plain dict before comparing.
    round_tripped = list(map(dict, reader))
    assert round_tripped == records
def test_read_write_csv_compressed(tmpdir):
    """
    Rows must round-trip for plain and compressed file extensions.

    When ``compat.is_python2`` is True, opening a compressed file in text
    mode is instead expected to raise ValueError.
    """
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    extensions = ('.csv', '.csv.gz', '.csv.bz2', '.csv.xz')
    for extension in extensions:
        target = str(tmpdir.join('test_read_write_csv' + extension))
        expect_failure = compat.is_python2 is True and extension != '.csv'
        if not expect_failure:
            io.write_csv(rows, target, make_dirs=True)
            round_tripped = [row for row in io.read_csv(target)]
            assert round_tripped == rows
            continue
        # Python 2 with a compressed extension: text-mode open must be rejected.
        with pytest.raises(ValueError):
            io.open_sesame(
                target, mode='wt', encoding=None, make_dirs=True)
def test_read_write_compressed(self, tmpdir):
    """
    Rows must round-trip for plain and compressed file extensions.

    When ``compat.PY2`` is True, opening a compressed file in text mode is
    instead expected to raise ValueError.
    """
    rows = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    extensions = (".csv", ".csv.gz", ".csv.bz2", ".csv.xz")
    for extension in extensions:
        target = str(tmpdir.join("test_read_write_csv" + extension))
        expect_failure = compat.PY2 is True and extension != ".csv"
        if not expect_failure:
            io.write_csv(rows, target, make_dirs=True)
            round_tripped = [row for row in io.read_csv(target)]
            assert round_tripped == rows
            continue
        # Python 2 with a compressed extension: text-mode open must be rejected.
        with pytest.raises(ValueError):
            io.open_sesame(target, mode="wt", encoding=None, make_dirs=True)