def compute_kmeans(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `sklearn.cluster.KMeans`
    (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)"""
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]
    km = sklearn.cluster.KMeans(n_clusters=n_clusters, **kwargs).fit(data.T)
    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=km.cluster_centers_, labels=km.labels_))
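# Added usage sketch (not part of the original source): exercises compute_kmeans on the
# small fixture corpus defined by create_vectorized_corpus() further down; `random_state`
# is simply forwarded to sklearn.cluster.KMeans via **kwargs.
def test_compute_kmeans_usage_sketch():
    corpus = create_vectorized_corpus()
    clusters = compute_kmeans(corpus, tokens=['a', 'b', 'c', 'd'], n_clusters=2, random_state=42)
    assert clusters is not None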
def co_occurrence_corpus_to_co_occurrence(
    *,
    coo_corpus: VectorizedCorpus,
    token2id: Token2Id,
) -> CoOccurrenceDataFrame:
    """Creates a co-occurrence data frame from a co-occurrence DTM corpus."""
    return coo_corpus.to_co_occurrences(token2id)
def simple_corpus_with_pivot_keys():
    corpus = VectorizedCorpus(
        bag_term_matrix=np.array(
            [
                [2, 1, 4, 1],
                [2, 2, 3, 0],
                [2, 3, 2, 0],
                [2, 4, 1, 1],
                [2, 0, 1, 1],
            ]
        ),
        token2id={'a': 0, 'b': 1, 'c': 2, 'd': 3},
        document_index=pd.DataFrame(
            {
                'year': [2009, 2013, 2014, 2017, 2017],
                'color_id': [0, 0, 1, 2, 3],
                'cov_id': [1, 1, 2, 2, 3],
                'document_id': [0, 1, 2, 3, 4],
                'document_name': [f'doc_{y}_{i}' for i, y in enumerate(range(0, 5))],
                'filename': [f'doc_{y}_{i}.txt' for i, y in enumerate(range(0, 5))],
            },
            dtype=np.int16,
        ),
    )
    return corpus
def test_dump_and_store_of_corpus_with_empty_trailing_row():
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [0, 0, 0, 0]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014]})
    corpus: VectorizedCorpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)
    corpus.dump(tag="ZERO", folder="./tests/output")
    loaded_corpus = VectorizedCorpus.load(tag="ZERO", folder="./tests/output")
    assert corpus.data.shape == loaded_corpus.data.shape
def test_co_occurrence_matrix(corpus: VectorizedCorpus):
    m = corpus.co_occurrence_matrix()
    assert m is not None
    assert (
        m
        == np.matrix(
            [
                [0, 20, 22, 6],
                [0, 0, 20, 5],
                [0, 0, 0, 6],
                [0, 0, 0, 0],
            ]
        )
    ).all()
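# Added sketch (an assumption checked against the expected values above, not part of the
# original source): for this fixture the co-occurrence matrix coincides with the upper
# triangle of X.T @ X with the diagonal zeroed, i.e. pairwise token-count products summed
# over documents, self co-occurrences dropped.
def test_co_occurrence_matrix_matches_gram_matrix(corpus: VectorizedCorpus):
    X = np.asarray(corpus.data.todense())
    expected = np.triu(X.T @ X, k=1)
    assert (corpus.co_occurrence_matrix() == expected).all()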
def test_to_bag_of_terms(corpus: VectorizedCorpus):
    expected_docs = [
        ['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'],
        ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'b', 'c', 'd'],
        ['a', 'a', 'c', 'd'],
    ]
    assert [list(x) for x in corpus.to_bag_of_terms()] == expected_docs
def create_vectorized_corpus():
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({'year': [2013, 2013, 2014, 2014, 2014]})
    corpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)
    return corpus
def compute_kmeans2(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `scipy.cluster.vq.kmeans2`
    (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans2.html)"""
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]
    data = data.T.todense()
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(np.float64)
    centroids, labels = scipy.cluster.vq.kmeans2(data, n_clusters, **kwargs)
    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=centroids, labels=labels))
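# Added usage sketch (not part of the original source): unlike sklearn's KMeans, kmeans2
# requires dense floating-point input, which compute_kmeans2 converts to internally;
# `minit='points'` is forwarded to scipy.cluster.vq.kmeans2 via **kwargs.
def test_compute_kmeans2_usage_sketch():
    corpus = create_vectorized_corpus()
    clusters = compute_kmeans2(corpus, tokens=['a', 'b', 'c', 'd'], n_clusters=2, minit='points')
    assert clusters is not None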
def test_bag_term_matrix_to_bag_term_docs(corpus: VectorizedCorpus):
    doc_ids = (0, 1)
    expected = [['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'], ['a', 'a', 'b', 'b', 'c', 'c', 'c']]
    docs = corpus.to_bag_of_terms(doc_ids)
    assert expected == [list(d) for d in docs]

    expected = [
        ['a', 'a', 'b', 'c', 'c', 'c', 'c', 'd'],
        ['a', 'a', 'b', 'b', 'c', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'c', 'c'],
        ['a', 'a', 'b', 'b', 'b', 'b', 'c', 'd'],
        ['a', 'a', 'c', 'd'],
    ]
    docs = corpus.to_bag_of_terms()
    assert expected == [list(d) for d in docs]
def test_find_matching_indices(corpus: VectorizedCorpus):
    corpus._token2id = {"bengt": 0, "bertil": 1, "eva": 2, "julia": 3}  # pylint: disable=protected-access
    assert set(corpus.find_matching_words_indices(["jens"], 4)) == set()
    assert set(corpus.find_matching_words_indices([], 4)) == set()
    assert set(corpus.find_matching_words_indices(["bengt"], 4)) == {0}
    assert set(corpus.find_matching_words_indices(["b*"], 4)) == {0, 1}
    assert set(corpus.find_matching_words_indices(["|.*a|"], 4)) == {2, 3}
    assert set(corpus.find_matching_words_indices(["*"], 4)) == {0, 1, 2, 3}
def corpus(self) -> VectorizedCorpus:
    shape: Tuple[int, int] = (len(self.document_index), len(self.pair2id))
    self.matrix = sp.coo_matrix((self.data, (self.row, self.col)), shape=shape)
    corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix=self.matrix.tocsr(),
        token2id=dict(self.pair2id.data),
        document_index=self.document_index.set_index('document_id', drop=False),
    )
    return corpus
def create_smaller_vectorized_corpus():
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame(
        {
            'year': [2013, 2013, 2014, 2014, 2014],
            'filename': ['2013.txt', '2013.txt', '2014.txt', '2014.txt', '2014.txt'],
            'document_id': [0, 1, 2, 3, 4],
        }
    )
    v_corpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)
    return v_corpus
def test_group_by_year_with_average():
    corpus = [
        "the house had a tiny little mouse",
        "the cat saw the mouse",
        "the mouse ran away from the house",
        "the cat finally ate the mouse",
        "the end of the mouse story",
    ]
    expected_bag_term_matrix = np.array(
        [
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1],
            [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0],
            [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 2, 0],
            [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0],
            [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0],
        ]
    )
    expected_bag_term_matrix_sums = np.array(
        [
            expected_bag_term_matrix[[0, 1, 2], :].sum(axis=0),
            expected_bag_term_matrix[[3, 4], :].sum(axis=0),
        ]
    )
    expected_bag_term_matrix_means = np.array(
        [
            expected_bag_term_matrix[[0, 1, 2], :].sum(axis=0) / 3.0,
            expected_bag_term_matrix[[3, 4], :].sum(axis=0) / 2.0,
        ]
    )
    document_index = pd.DataFrame({'year': [1, 1, 1, 2, 2], 'document_id': range(0, 5)})

    vec = CountVectorizer()
    bag_term_matrix = vec.fit_transform(corpus)

    v_corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix, token2id=vec.vocabulary_, document_index=document_index
    )
    assert np.allclose(expected_bag_term_matrix, bag_term_matrix.todense())

    y_sum_corpus = v_corpus.group_by_year(aggregate='sum', fill_gaps=True)
    y_mean_corpus = v_corpus.group_by_year(aggregate='mean', fill_gaps=True)

    assert np.allclose(expected_bag_term_matrix_sums, y_sum_corpus.data.todense())
    assert np.allclose(expected_bag_term_matrix_means, y_mean_corpus.data.todense())
def test_normalize_by_raw_counts():
    corpus: VectorizedCorpus = VectorizedCorpus(
        bag_term_matrix=np.array([[4, 3, 7, 1], [6, 7, 4, 2]]),
        token2id={'a': 0, 'b': 1, 'c': 2, 'd': 3},
        document_index=pd.DataFrame({'year': [2013, 2014]}),
    )
    n_corpus = corpus.normalize()
    t_corpus = corpus.normalize_by_raw_counts()
    assert np.allclose(t_corpus.data.todense(), n_corpus.data.todense())
def compute_hca(
    corpus: VectorizedCorpus, tokens: List[str], linkage_method: str = 'ward', linkage_metric: str = 'euclidean'
) -> HCACorpusClusters:
    """Computes HCA clusters using `scipy.cluster.hierarchy.linkage`
    (https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html)"""
    data = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]
    linkage_matrix = linkage(data.T.todense(), method=linkage_method, metric=linkage_metric)
    """From the scipy documentation:
    An (n - 1) by 4 matrix Z is returned. At the i-th iteration, clusters with token_ids
    Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less
    than n corresponds to one of the original observations. The distance between clusters
    Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number
    of original observations in the newly formed cluster.
    """
    return HCACorpusClusters(corpus, tokens, linkage_matrix)
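# Added usage sketch (not part of the original source): flat cluster labels can be cut
# from a linkage matrix with scipy.cluster.hierarchy.fcluster, here capping the number
# of clusters at two; the linkage call mirrors the one inside compute_hca above.
def test_hca_linkage_to_flat_clusters_sketch():
    from scipy.cluster.hierarchy import fcluster

    corpus = create_vectorized_corpus()
    linkage_matrix = linkage(corpus.data.T.todense(), method='ward', metric='euclidean')
    labels = fcluster(linkage_matrix, t=2, criterion='maxclust')
    assert len(labels) == corpus.data.shape[1]  # one flat cluster label per token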
def test_load_of_uncompressed_corpus(text_corpus):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Arrange
    corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(text_corpus, already_tokenized=True)
    corpus.dump(tag='dump_test', folder=OUTPUT_FOLDER, compressed=False)

    # Act
    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag='dump_test', folder=OUTPUT_FOLDER)

    # Assert
    assert (corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert corpus.token2id == loaded_corpus.token2id
def store_corpus_bundle(corpus: VectorizedCorpus, args: interface.ComputeOpts):
    if VectorizedCorpus.dump_exists(tag=args.corpus_tag, folder=args.target_folder):
        VectorizedCorpus.remove(tag=args.corpus_tag, folder=args.target_folder)

    target_folder = args.target_folder
    if args.create_subfolder:
        if os.path.split(target_folder)[1] != args.corpus_tag:
            target_folder = os.path.join(target_folder, args.corpus_tag)
        os.makedirs(target_folder, exist_ok=True)

    corpus.dump(tag=args.corpus_tag, folder=target_folder)
    VectorizedCorpus.dump_options(
        tag=args.corpus_tag,
        folder=target_folder,
        options=args.props,
    )
def test_group_by_time_period_aggregates_DTM_to_PTM():
    bag_term_matrix = np.array([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [2, 4, 1, 1], [2, 0, 1, 1]])
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame(
        {
            'year': [2009, 2013, 2014, 2017, 2017],
            'filename': ['2009.txt', '2013.txt', '2014.txt', '2017.txt', '2017.txt'],
            'document_id': [0, 1, 2, 3, 4],
        }
    )
    corpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)

    grouped_corpus = corpus.group_by_time_period_optimized(time_period_specifier='year')
    expected_ytm = [[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [4, 4, 2, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(time_period_specifier='lustrum')
    expected_ytm = [[2, 1, 4, 1], [4, 5, 5, 0], [4, 4, 2, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(time_period_specifier='decade')
    expected_ytm = [[2, 1, 4, 1], [8, 9, 7, 2]]
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())

    grouped_corpus = corpus.group_by_time_period_optimized(time_period_specifier='year', fill_gaps=True)
    expected_ytm = np.matrix(
        [
            [2, 1, 4, 1],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [2, 2, 3, 0],
            [2, 3, 2, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [4, 4, 2, 2],
        ]
    )
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())
    assert len(grouped_corpus.document_index) == 9
    assert is_strictly_increasing(grouped_corpus.document_index.index, sort_values=False)
def create_abc_corpus(dtm: List[List[int]], document_years: List[int] = None, token2id: dict = None) -> VectorizedCorpus:
    bag_term_matrix = np.array(dtm)
    token2id = token2id or {chr(ord('a') + i): i for i in range(0, bag_term_matrix.shape[1])}
    years: List[int] = (
        document_years if document_years is not None else [2000 + i for i in range(0, bag_term_matrix.shape[0])]
    )
    document_index = pd.DataFrame(
        {
            'year': years,
            'filename': [f'{year}_{i}.txt' for i, year in enumerate(years)],
            'document_id': [i for i in range(0, bag_term_matrix.shape[0])],
        }
    )
    corpus: VectorizedCorpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)
    return corpus
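# Added usage sketch (not part of the original source): builds a two-document,
# three-token corpus with auto-generated vocabulary ('a', 'b', 'c') and years
# (2000, 2001) from the defaults above.
def test_create_abc_corpus_usage_sketch():
    corpus = create_abc_corpus([[1, 0, 2], [0, 3, 1]])
    assert corpus.data.shape == (2, 3)
    assert corpus.token2id == {'a': 0, 'b': 1, 'c': 2}
    assert list(corpus.document_index.year) == [2000, 2001]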
def test_from_token_ids_stream():
    tokenized_corpus: MockedProcessedCorpus = mock_corpus()
    token2id: dict = tokenized_corpus.token2id
    id2token: dict = {v: k for k, v in tokenized_corpus.token2id.items()}

    """Arrange: simulate tagged ID frame payloads by turning the corpus into a stream of document_id ✕ pd.Series"""
    document_index: pd.DataFrame = tokenized_corpus.document_index
    name2id = document_index.set_index('filename')['document_id'].to_dict().get
    tokens2series = lambda tokens: pd.Series([token2id[t] for t in tokens], dtype=np.int64)
    stream = [(name2id(filename), tokens2series(tokens)) for filename, tokens in tokenized_corpus]
    assert [id2token[t] for t in stream[0][1]] == tokenized_corpus.data[0][1]

    """Act: create a vectorized corpus out of the stream"""
    vectorized_corpus: VectorizedCorpus = VectorizedCorpus.from_token_id_stream(stream, token2id, document_index)
    assert vectorized_corpus is not None

    """Check results"""
    expected_dtm = np.matrix([[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [2, 4, 1, 1], [2, 0, 1, 1]])
    assert (vectorized_corpus.data.todense() == expected_dtm).all()
def test_pick_top_tf_map(corpus: VectorizedCorpus):
    assert corpus.pick_top_tf_map(2) == {'a': 10, 'c': 11}
def test_step_by_step_llr_compute_corpus_keyness_alternative():
    bundle: Bundle = create_keyness_test_bundle(data=SIMPLE_CORPUS_ABCDE_3DOCS)
    opts: ComputeKeynessOpts = create_keyness_opts(keyness=KeynessMetric.HAL_cwr)
    corpus: VectorizedCorpus = bundle.corpus
    concept_corpus: VectorizedCorpus = bundle.concept_corpus
    token2id: Token2Id = bundle.token2id
    pivot_key: str = opts.pivot_column_name

    with inline_code(source=keyness.compute_weighed_corpus_keyness):
        zero_out_indices: Sequence[int] = corpus.zero_out_by_tf_threshold(3)
        concept_corpus.zero_out_by_indices(zero_out_indices)

    with inline_code(source=keyness.compute_corpus_keyness):
        corpus = corpus.group_by_time_period_optimized(
            time_period_specifier=opts.period_pivot,
            target_column_name=pivot_key,
            fill_gaps=opts.fill_gaps,
            aggregate='sum',
        )
        # matrix([[3, 0, 0, 0, 3, 4, 8, 3, 5, 6, 3, 0, 3]])

        rows = []
        cols = []
        data = []
        pairs2token = corpus.vocabs_mapping.get
        # {(0, 2): 0, (0, 3): 2, (0, 4): 3, (0, 5): 10, (0, 6): 7, (2, 3): 1, (2, 4): 5,
        #  (2, 6): 11, (3, 4): 4, (3, 6): 12, (4, 5): 6, (4, 6): 9, (5, 6): 8}
        for document_id, term_term_matrix in corpus.to_term_term_matrix_stream(token2id):
            # 0,
            # matrix([[0, 0, 3, 0, 0, 3, 3],
            #         [0, 0, 0, 0, 0, 0, 0],
            #         [0, 0, 0, 0, 4, 0, 0],
            #         [0, 0, 0, 0, 3, 0, 3],
            #         [0, 0, 0, 0, 0, 8, 6],
            #         [0, 0, 0, 0, 0, 0, 5],
            #         [0, 0, 0, 0, 0, 0, 0]])
            meta_data = corpus.document_index[corpus.document_index.document_id == document_id].to_dict('records')[0]
            weights, (w1_ids, w2_ids) = metrics.significance(
                TTM=term_term_matrix,
                metric=opts.keyness,
                normalize=opts.normalize,
                n_contexts=meta_data['n_documents'],
                n_words=meta_data['n_tokens'],
            )
            # (array([-279.97270999,  -23.03480975, -120.70153416,  -85.94256279,
            #          -17.99472463, -182.2522578 ,  -20.19035001,  144.74677931]),
            #  (array([0, 0, 2, 3, 3, 4, 4, 5], dtype=int32),
            #   array([5, 6, 4, 4, 6, 5, 6, 6], dtype=int32)))
            token_ids = (pairs2token(p) for p in zip(w1_ids, w2_ids))
            rows.extend([document_id] * len(weights))
            cols.extend(token_ids)
            data.extend(weights)

        bag_term_matrix = scipy.sparse.csr_matrix(
            (data, (rows, cols)),
            shape=(len(corpus.document_index), len(corpus.token2id)),
            dtype=np.float64,
        )
        llr_corpus = VectorizedCorpus(
            bag_term_matrix=bag_term_matrix,
            token2id=corpus.token2id,
            document_index=corpus.document_index,
        ).remember(vocabs_mapping=corpus.vocabs_mapping)

    assert llr_corpus is not None
    pp(llr_corpus.data.todense())
def test_LEGACY_step_by_step_llr_compute_corpus_keyness():
    bundle: Bundle = create_keyness_test_bundle(data=SIMPLE_CORPUS_ABCDE_3DOCS)
    opts: ComputeKeynessOpts = create_keyness_opts(keyness=KeynessMetric.LLR)
    corpus: VectorizedCorpus = bundle.corpus
    concept_corpus: VectorizedCorpus = bundle.concept_corpus
    token2id: Token2Id = bundle.token2id
    pivot_key: str = opts.pivot_column_name

    with inline_code(source=keyness.compute_weighed_corpus_keyness):
        zero_out_indices: Sequence[int] = corpus.zero_out_by_tf_threshold(3)
        concept_corpus.zero_out_by_indices(zero_out_indices)

    with inline_code(source=keyness.compute_corpus_keyness):
        corpus = corpus.group_by_time_period_optimized(
            time_period_specifier=opts.period_pivot,
            target_column_name=pivot_key,
            fill_gaps=opts.fill_gaps,
            aggregate='sum',
        )
        # matrix([[3, 0, 0, 0, 3, 4, 8, 3, 5, 6, 3, 0, 3]])

        """Current implementation"""
        with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn.to_keyness_co_occurrence_corpus):
            with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn.to_keyness_co_occurrences):
                co_occurrences: pd.DataFrame = corpus.to_co_occurrences(token2id)
                with inline_code(source=metrics.partitioned_significances):
                    vocabulary_size: int = len(token2id)
                    co_occurrence_partitions = []
                    for period in co_occurrences[pivot_key].unique():
                        pivot_co_occurrences = co_occurrences[co_occurrences[pivot_key] == period]
                        term_term_matrix = scipy.sparse.csc_matrix(
                            (pivot_co_occurrences.value, (pivot_co_occurrences.w1_id, pivot_co_occurrences.w2_id)),
                            shape=(vocabulary_size, vocabulary_size),
                            dtype=np.float64,
                        )
                        n_contexts = metrics._get_documents_count(corpus.document_index, pivot_co_occurrences)
                        weights, (w1_ids, w2_ids) = metrics.significance(
                            TTM=term_term_matrix,
                            metric=opts.keyness,
                            normalize=opts.normalize,
                            n_contexts=n_contexts,
                        )
                        co_occurrence_partitions.append(
                            pd.DataFrame(data={pivot_key: period, 'w1_id': w1_ids, 'w2_id': w2_ids, 'value': weights})
                        )
                    keyness_co_occurrences = pd.concat(co_occurrence_partitions, ignore_index=True)
                mg = corpus.get_token_ids_2_pair_id(token2id=token2id).get
                keyness_co_occurrences['token_id'] = [
                    mg((x[0].item(), x[1].item()))
                    for x in keyness_co_occurrences[['w1_id', 'w2_id']].to_records(index=False)
                ]
            with inline_code(source=ttm_legacy.LegacyCoOccurrenceMixIn._to_co_occurrence_matrix):
                pg: Callable = {v: k for k, v in corpus.document_index[pivot_key].to_dict().items()}.get
                llr_matrix: scipy.sparse.spmatrix = scipy.sparse.coo_matrix(
                    (
                        keyness_co_occurrences.value,
                        (
                            keyness_co_occurrences[pivot_key].apply(pg).astype(np.int32),
                            keyness_co_occurrences.token_id.astype(np.int32),
                        ),
                    ),
                    shape=corpus.data.shape,
                )
            llr_corpus: VectorizedCorpus = VectorizedCorpus(
                bag_term_matrix=llr_matrix,
                token2id=corpus.token2id,
                document_index=corpus.document_index,
                vocabs_mapping=corpus.vocabs_mapping,
            )

    assert llr_corpus is not None
    pp(llr_corpus.data.todense())
def test_to_n_top_dataframe(corpus: VectorizedCorpus):
    assert corpus.to_n_top_dataframe(1) is not None
def test_token_indices(corpus: VectorizedCorpus):
    assert corpus.token_indices(['a', 'c', 'z']) == [0, 2]
def test_tf_idf(corpus: VectorizedCorpus):
    assert corpus.tf_idf() is not None
def test_get_top_n_words(corpus: VectorizedCorpus):
    assert corpus.get_top_n_words(n=2) == [('c', 11), ('a', 10)]
import os

from penelope.common.curve_fit import pchip_spline
from penelope.common.keyness.metrics import KeynessMetric  # , rolling_average_smoother
from penelope.corpus import VectorizedCorpus
from penelope.notebook.word_trends.displayers import TopTokensDisplayer
from penelope.notebook.word_trends.interface import TrendsComputeOpts

# pylint: disable=protected-access

DEFAULT_SMOOTHERS = [pchip_spline]

folder = "/path/to/data"
tag = os.path.split(folder)[1]

corpus: VectorizedCorpus = VectorizedCorpus.load(folder=folder, tag=tag)
compute_opts: TrendsComputeOpts = TrendsComputeOpts(normalize=False, keyness=KeynessMetric.TF, temporal_key='year')
top_tokens = corpus.get_top_n_words(n=100000)

displayer: TopTokensDisplayer = TopTokensDisplayer()
displayer.setup()

indices = [x[1] for x in top_tokens]
smooth = False
plot_data = displayer._compile(
    corpus=corpus,
    compute_opts=compute_opts,
    indices=indices,
    smoothers=DEFAULT_SMOOTHERS if smooth else [],
)
def test_stats(corpus: VectorizedCorpus):
    assert corpus.stats() is not None
def test_load_dumped_corpus(mode: str, vectorized_corpus: VectorizedCorpus):
    tag: str = str(uuid.uuid1())[:6]
    folder: str = jj(OUTPUT_FOLDER, tag)

    os.makedirs(folder, exist_ok=True)

    vectorized_corpus.dump(tag=tag, folder=folder, compressed=True, mode=mode)

    assert VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert VectorizedCorpus.find_tags(folder) == [tag]

    loaded_corpus: VectorizedCorpus = VectorizedCorpus.load(tag=tag, folder=folder)
    assert (vectorized_corpus.term_frequency == loaded_corpus.term_frequency).all()
    assert vectorized_corpus.document_index.to_dict() == loaded_corpus.document_index.to_dict()
    assert vectorized_corpus.token2id == loaded_corpus.token2id

    loaded_options: dict = VectorizedCorpus.load_options(tag=tag, folder=folder)
    assert loaded_options == dict()

    VectorizedCorpus.dump_options(tag=tag, folder=folder, options=dict(apa=1))
    loaded_options = VectorizedCorpus.load_options(tag=tag, folder=folder)
    assert loaded_options == dict(apa=1)

    VectorizedCorpus.remove(tag=tag, folder=folder)
    assert not VectorizedCorpus.dump_exists(tag=tag, folder=folder)
    assert not VectorizedCorpus.find_tags(folder)

    shutil.rmtree(folder)