def test_word_vector_resource(self):
    """Check WordVector loading and Vocabulary.make_embedding on a dummy file.

    A small text file with one ``word v1 ... vN`` line per word is written
    to the path the storage resolves for ``external/word2vec_dummyr.txt``,
    loaded through ``WordVector``, and then used to build an embedding for
    the vocabulary. The generated file is removed afterwards, even when an
    assertion fails, so the test leaves nothing behind on disk.
    """
    # NOTE(review): the other tests in this file use "../data" — confirm
    # that "./data" is the intended root here.
    path = os.path.join(os.path.dirname(__file__), "./data")
    storage = Storage(path)

    vocab = Vocabulary()
    vocab.set(["you", "loaded", "word", "vector", "now"])

    vector_size = 50
    # Only three of the five vocabulary words get a vector in the file.
    word2vec = [
        "you " + " ".join(["0"] * vector_size),
        "word " + " ".join(["1"] * vector_size),
        "now " + " ".join(["2"] * vector_size),
    ]
    word2vec_file = Path(storage.path("external/word2vec_dummyr.txt"))
    with word2vec_file.open(mode="w", encoding="utf-8") as f:
        f.write("\n".join(word2vec))

    try:
        wv = WordVector(word2vec_file)
        key_vector = wv.load()

        # Every loaded key must be a vocabulary word with a full-size vector.
        words = vocab.get()
        for k in key_vector:
            self.assertIn(k, words)
            self.assertEqual(len(key_vector[k]), vector_size)

        # The embedding has one row per vocabulary entry.
        embed = vocab.make_embedding(word2vec_file)
        self.assertEqual(embed.shape, (len(vocab.get()), vector_size))
    finally:
        # Remove the generated fixture regardless of the test outcome.
        if word2vec_file.exists():
            word2vec_file.unlink()
def test_chazutsu(self):
    """Download DUC2004 via chazutsu and read it back through Storage.

    The dataset is downloaded into the storage's ``raw`` path, wrapped with
    ``storage.chazutsu`` and its first rows are printed. The downloaded
    directory (``r.root``) is removed in a ``finally`` block so that a
    failure while reading the data does not leave it on disk.
    """
    path = os.path.join(os.path.dirname(__file__), "../data")
    storage = Storage(path)
    r = chazutsu.datasets.DUC2004().download(storage.path("raw"))
    try:
        df = storage.chazutsu(r.root).data()
        print(df.head(5))
    finally:
        # Always clean up the downloaded files, even if reading failed.
        shutil.rmtree(r.root)
def test_read_file(self):
    """Check that DataFile.fetch yields the same items as DataFile.to_array.

    Both reads target ``raw/sample_dataset.csv``. The lengths are compared
    before the element-wise comparison because ``zip`` stops at the shorter
    input and would otherwise hide missing or extra items.
    """
    path = os.path.join(os.path.dirname(__file__), "../data")
    storage = Storage(path)
    data_file = DataFile(storage.path("raw/sample_dataset.csv"))

    content = data_file.to_array()
    fetched = list(data_file.fetch(progress=True))

    # Guard against zip() silently truncating to the shorter sequence.
    self.assertEqual(len(content), len(fetched))
    for c, f in zip(content, fetched):
        self.assertEqual(c, f)
def test_convert(self):
    """Exercise DataFile.convert for directory, attribute and extension changes.

    Each conversion of ``raw/sample_dataset.csv`` is compared, after
    ``resolve``, with the path it is expected to produce.
    """
    data_root = os.path.join(os.path.dirname(__file__), "../data")
    storage = Storage(data_root)
    source = DataFile(storage.path("raw/sample_dataset.csv"))

    def assert_same_path(data_file, expected):
        # Compare resolved paths so equivalent spellings compare equal.
        self.assertEqual(resolve(data_file.path), resolve(expected))

    # Move the file to another data directory.
    moved = source.convert(data_dir_to="interim")
    assert_same_path(
        moved, os.path.join(data_root, "./interim/sample_dataset.csv"))

    # Append an attribute to the file name.
    with_attribute = source.convert(add_attribute="preprocessed")
    assert_same_path(
        with_attribute, storage.path("raw/sample_dataset__preprocessed.csv"))

    # Rename the attribute that was just added.
    renamed_attribute = with_attribute.convert(
        attribute_to={"preprocessed": "converted"})
    assert_same_path(
        renamed_attribute, storage.path("raw/sample_dataset__converted.csv"))

    # Change only the extension.
    as_text = source.convert(ext_to=".txt")
    assert_same_path(as_text, storage.path("raw/sample_dataset.txt"))
def test_path(self):
    """Check that Storage.path("raw") matches ``<root>/raw`` after resolve."""
    base_dir = os.path.join(os.path.dirname(__file__), "../../data")
    storage = Storage(base_dir)

    expected_raw = os.path.join(base_dir, "raw")
    actual = resolve(storage.path("raw"))
    expected = resolve(expected_raw)
    self.assertEqual(actual, expected)