Пример #1
0
def test_read_write_csv_dialect(tmpdir):
    """Rows written with the 'excel' dialect are read back unchanged via dialect='infer'."""
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    csv_path = str(tmpdir.join('test_read_write_csv.csv'))
    io.write_csv(rows, csv_path, dialect='excel', make_dirs=True)
    # Consume the reader fully so the comparison is list-to-list.
    roundtripped = [row for row in io.read_csv(csv_path, dialect='infer')]
    assert roundtripped == rows
Пример #2
0
 def test_read_write_dict(self, tmpdir):
     """Dict rows written with explicit fieldnames are read back as equal dicts."""
     columns = ("text", "kind", "number")
     records = [
         {"text": "this is some text", "kind": "scandal", "number": 42.0},
         {"text": "here's some more text: boom!", "kind": "escándalo", "number": 1.0},
     ]
     csv_path = str(tmpdir.join("test_read_write_csv_dict.csv"))
     io.write_csv(
         records,
         csv_path,
         dialect="excel",
         make_dirs=True,
         fieldnames=list(columns),
     )
     rows_read = io.read_csv(csv_path, dialect="excel", fieldnames=list(columns))
     # Each row is converted to a plain dict before comparing with the input.
     roundtripped = list(map(dict, rows_read))
     assert roundtripped == records
Пример #3
0
    def load(self, exclude: Optional[Set[str]] = None) -> Dict[str, str]:
        """
        Read the tab-delimited resource file and map each row's "Id" to its "Part1".

        Rows whose "Part1" value is missing or empty are skipped, as are rows whose
        "Part1" value appears in ``exclude``.
        NOTE(review): "Id" / "Part1" are presumably ISO 639-3 / ISO 639-1 codes --
        confirm against the data file.

        Args:
            exclude: "Part1" values to leave out of the mapping; if None,
                nothing is excluded.

        Returns:
            Dict[str, str]: Mapping of "Id" value to "Part1" value.
        """
        columns = [
            "Id",
            "Part2B",
            "Part2T",
            "Part1",
            "Scope",
            "Language_Type",
            "Ref_Name",
            "Comment",
        ]
        records = tio.read_csv(
            self.data_dir.joinpath(self.filename),
            delimiter="\t",
            fieldnames=columns,
            quoting=1,  # numeric value of csv.QUOTE_ALL
        )
        lang_map = {}
        for record in records:
            part1 = record.get("Part1")
            if not part1:
                continue
            if exclude is not None and part1 in exclude:
                continue
            lang_map[record["Id"]] = part1
        # Log only a small, sorted sample of the mapping.
        sample = sorted(lang_map.items())[:5]
        LOGGER.info("loaded IsoLangResource data:\n%s ...", sample)
        return lang_map
Пример #4
0
    def load(
        self,
        iso_lang_map: Dict[str, str],
        min_len: int = 25,
    ) -> List[Tuple[str, str]]:
        """
        Read "sentences.csv" from the data directory and collect (text, lang) pairs.

        A row is kept only if its "iso-639-3" value is a key of ``iso_lang_map``
        and its text has at least ``min_len`` alphanumeric characters.

        Args:
            iso_lang_map: Mapping from the file's "iso-639-3" values to the language
                label emitted in the output pairs.
            min_len: Minimum number of *alphanumeric* chars a text must contain
                for its example to be included.

        Returns:
            Sequence of (text, lang) examples.
        """
        records = tio.read_csv(
            self.data_dir.joinpath("sentences.csv"),
            delimiter="\t",
            fieldnames=["sent_id", "iso-639-3", "text"],
            quoting=1,  # numeric value of csv.QUOTE_ALL
        )
        data = []
        for record in records:
            code = record["iso-639-3"]
            if code not in iso_lang_map:
                continue
            text = record["text"]
            # Only alphanumeric characters count toward the length threshold.
            n_alnum = itertoolz.count(char for char in text if char.isalnum())
            if n_alnum >= min_len:
                data.append((text, iso_lang_map[code]))
        LOGGER.info("loaded TatoebaDataset data:\n%s ...", data[:3])
        return data
Пример #5
0
def test_read_write_csv_delimiters(tmpdir):
    """Rows survive a write/read round trip for each supported delimiter."""
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    # The same file is overwritten on every iteration.
    csv_path = str(tmpdir.join('test_read_write_csv.csv'))
    for sep in (',', '\t', '|', ':'):
        io.write_csv(rows, csv_path, delimiter=sep, make_dirs=True)
        roundtripped = list(io.read_csv(csv_path, delimiter=sep))
        assert roundtripped == rows
Пример #6
0
 def test_read_write_dialect(self, tmpdir):
     """Write rows with the "excel" dialect, read them back with dialect="infer"."""
     first_row = ["this is some text", "scandal", 42.0]
     second_row = ["here's some more text: boom!", "escándalo", 1.0]
     rows = [first_row, second_row]
     csv_path = str(tmpdir.join("test_read_write_csv.csv"))
     io.write_csv(rows, csv_path, dialect="excel", make_dirs=True)
     rows_read = io.read_csv(csv_path, dialect="infer")
     assert list(rows_read) == rows
Пример #7
0
 def test_read_write_compressed(self, tmpdir):
     """Rows round-trip through plain and compressed (.gz, .bz2, .xz) CSV files."""
     rows = [
         ["this is some text", "scandal", 42.0],
         ["here's some more text: boom!", "escándalo", 1.0],
     ]
     basename = "test_read_write_csv"
     for suffix in (".csv", ".csv.gz", ".csv.bz2", ".csv.xz"):
         # The file extension is the only thing that varies between iterations.
         csv_path = str(tmpdir.join(basename + suffix))
         io.write_csv(rows, csv_path, make_dirs=True)
         roundtripped = list(io.read_csv(csv_path))
         assert roundtripped == rows
Пример #8
0
 def test_read_write_delimiters(self, tmpdir):
     """Rows round-trip unchanged when written and read with the same delimiter."""
     rows = [
         ["this is some text", "scandal", 42.0],
         ["here's some more text: boom!", "escándalo", 1.0],
     ]
     separators = (",", "\t", "|", ":")
     for sep in separators:
         csv_path = str(tmpdir.join("test_read_write_csv.csv"))
         io.write_csv(rows, csv_path, delimiter=sep, make_dirs=True)
         rows_read = io.read_csv(csv_path, delimiter=sep)
         assert list(rows_read) == rows
Пример #9
0
def test_read_write_csv_dict(tmpdir):
    """Dict rows written with explicit fieldnames come back as equal dicts."""
    records = [
        {
            'text': 'this is some text',
            'kind': 'scandal',
            'number': 42.0,
        },
        {
            'text': "here's some more text: boom!",
            'kind': 'escándalo',
            'number': 1.0,
        },
    ]
    csv_path = str(tmpdir.join('test_read_write_csv_dict.csv'))
    io.write_csv(
        records,
        csv_path,
        dialect='excel',
        make_dirs=True,
        fieldnames=['text', 'kind', 'number'],
    )
    roundtripped = []
    rows_read = io.read_csv(
        csv_path, dialect='excel', fieldnames=['text', 'kind', 'number'])
    for row in rows_read:
        # Convert each row to a plain dict before comparing.
        roundtripped.append(dict(row))
    assert roundtripped == records
Пример #10
0
def test_read_write_csv_compressed(tmpdir):
    """
    Rows round-trip through plain and compressed CSV files.

    When ``compat.is_python2`` is True, opening a compressed file in text mode
    is expected to raise ValueError instead.
    """
    rows = [
        ['this is some text', 'scandal', 42.0],
        ["here's some more text: boom!", 'escándalo', 1.0],
    ]
    for suffix in ('.csv', '.csv.gz', '.csv.bz2', '.csv.xz'):
        csv_path = str(tmpdir.join('test_read_write_csv' + suffix))
        expect_error = compat.is_python2 is True and suffix != '.csv'
        if not expect_error:
            io.write_csv(rows, csv_path, make_dirs=True)
            roundtripped = list(io.read_csv(csv_path))
            assert roundtripped == rows
            continue
        with pytest.raises(ValueError):
            io.open_sesame(
                csv_path, mode='wt', encoding=None, make_dirs=True)
Пример #11
0
 def test_read_write_compressed(self, tmpdir):
     """
     Rows round-trip through plain and compressed CSV files.

     When ``compat.PY2`` is True, opening a compressed file in text mode is
     expected to raise ValueError instead.
     """
     rows = [
         ["this is some text", "scandal", 42.0],
         ["here's some more text: boom!", "escándalo", 1.0],
     ]
     for suffix in (".csv", ".csv.gz", ".csv.bz2", ".csv.xz"):
         csv_path = str(tmpdir.join("test_read_write_csv" + suffix))
         expect_error = compat.PY2 is True and suffix != ".csv"
         if not expect_error:
             io.write_csv(rows, csv_path, make_dirs=True)
             rows_read = io.read_csv(csv_path)
             assert list(rows_read) == rows
             continue
         with pytest.raises(ValueError):
             io.open_sesame(csv_path, mode="wt", encoding=None, make_dirs=True)