def get_strictly_increasing_document_id(
    document_index: DocumentIndex, document_id_field: str = 'document_id'
) -> pd.Series:
    """Select a strictly increasing sequence of document ids for a document index.

    Candidates are tried in order, each judged by `is_strictly_increasing`
    called with its default arguments:

    1. the `document_id_field` column, when that column exists,
    2. the existing index of `document_index`,
    3. the index obtained from `document_index.reset_index()`.

    Args:
        document_index (DocumentIndex): Document index to inspect.
        document_id_field (str): Name of the column holding candidate ids.

    Returns:
        pd.Series: The id column when it qualifies. NOTE: the two fallback
            branches return an index object (`document_index.index` or the
            index of `document_index.reset_index()`) rather than a Series.

    Raises:
        ValueError: If neither candidate qualifies and the existing index
            has dtype int64.
    """
    # First choice: an explicit id column that already qualifies.
    has_id_column = document_id_field in document_index.columns
    if has_id_column and is_strictly_increasing(document_index[document_id_field]):
        return document_index[document_id_field]

    # Second choice: the index as it stands.
    current_index = document_index.index
    if is_strictly_increasing(current_index):
        return current_index

    # Per the original author's note, this situation (a remnant of the
    # deprecated document_index_upgrade() logic) should never occur.
    if current_index.dtype == np.dtype('int64'):
        raise ValueError("Integer index encountered that are not strictly increasing!")

    # Last resort: the index produced by resetting the existing one.
    return document_index.reset_index().index
def test_group_by_time_period_aggregates_DTM_to_PTM():
    """Check that grouping by time period sums document rows into period rows.

    A five-document corpus (years 2009, 2013, 2014, 2017, 2017) is grouped by
    'year', 'lustrum' and 'decade', and finally by 'year' with `fill_gaps=True`.
    Each grouped bag-term matrix is compared against a hand-computed expectation.
    """
    token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    bag_term_matrix = np.array(
        [
            [2, 1, 4, 1],
            [2, 2, 3, 0],
            [2, 3, 2, 0],
            [2, 4, 1, 1],
            [2, 0, 1, 1],
        ]
    )
    document_index = pd.DataFrame(
        {
            'year': [2009, 2013, 2014, 2017, 2017],
            'filename': ['2009.txt', '2013.txt', '2014.txt', '2017.txt', '2017.txt'],
            'document_id': [0, 1, 2, 3, 4],
        }
    )
    corpus = VectorizedCorpus(bag_term_matrix, token2id=token2id, document_index=document_index)

    # (time period specifier, expected period-term matrix), checked in this order.
    expectations = (
        # The two 2017 documents collapse into one row.
        ('year', [[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [4, 4, 2, 2]]),
        # 2013 and 2014 share a row; 2009 and 2017 each get their own.
        ('lustrum', [[2, 1, 4, 1], [4, 5, 5, 0], [4, 4, 2, 2]]),
        # 2009 stands alone; the remaining four documents are summed.
        ('decade', [[2, 1, 4, 1], [8, 9, 7, 2]]),
    )
    for specifier, expected_ptm in expectations:
        grouped = corpus.group_by_time_period_optimized(time_period_specifier=specifier)
        assert np.allclose(expected_ptm, grouped.bag_term_matrix.todense())

    # With gap filling every year 2009..2017 gets a row; years without
    # documents are expected to be all zeros.
    gap_filled = corpus.group_by_time_period_optimized(time_period_specifier='year', fill_gaps=True)
    expected_gap_filled = np.matrix(
        [
            [2, 1, 4, 1],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [2, 2, 3, 0],
            [2, 3, 2, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [4, 4, 2, 2],
        ]
    )
    assert np.allclose(expected_gap_filled, gap_filled.bag_term_matrix.todense())

    filled_index = gap_filled.document_index
    assert len(filled_index) == 9
    assert is_strictly_increasing(filled_index.index, sort_values=False)
def _ingest_document_index(self, document_index: DocumentIndex):
    """Validate a supplied document index and return it ready for use.

    Steps, in order:

    1. If the index dtype is not a numpy numeric type, log a warning and
       re-index on the 'document_id' column (kept as a column too), giving
       the index axis an empty name.
    2. Require `utility.is_strictly_increasing` to accept the index.
    3. Require one row per row of `self._bag_term_matrix`.
    4. Add an 'n_raw_tokens' column from `self.document_token_counts` when
       the column is missing.

    Args:
        document_index (DocumentIndex): Document index supplied by the caller.

    Returns:
        The validated document index, possibly re-indexed in step 1.

    Raises:
        ValueError: If the index is rejected in step 2, or the length check
            in step 3 fails.
    """
    # NOTE(review): the check is for np.number (any numeric dtype) although
    # the log and error messages talk about an integral index — confirm.
    index_is_numeric = np.issubdtype(document_index.index.dtype, np.number)
    if not index_is_numeric:
        logger.warning("VectorizedCorpus: supplied document index has not an integral index")
        document_index = document_index.set_index('document_id', drop=False).rename_axis('')

    if not utility.is_strictly_increasing(document_index.index):
        raise ValueError(
            "supplied `document index` must have an integer typed, strictly increasing index starting from 0"
        )

    expected_length = self._bag_term_matrix.shape[0]
    actual_length = len(document_index)
    if actual_length != expected_length:
        raise ValueError(
            f"expected `document index` to have length {expected_length} but found length {actual_length}"
        )

    # NOTE(review): when no re-indexing happened above, this column is added
    # to the very object the caller passed in.
    if 'n_raw_tokens' not in document_index.columns:
        document_index['n_raw_tokens'] = self.document_token_counts

    return document_index
def test_is_strictly_increasing():
    """Exercise `is_strictly_increasing` over step, start value and sorting options.

    The series are built with the builtin `int` dtype. The previously used
    `np.int` was only an alias for builtin `int`; it was deprecated in
    NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    """
    # Step of 1, with and without an explicit start value.
    assert is_strictly_increasing(pd.Series([0, 1, 2], dtype=int), by_value=1)
    assert is_strictly_increasing(
        pd.Series([0, 1, 2], dtype=int), by_value=1, start_value=0, sort_values=False
    )

    # Wrong step or wrong start value must be rejected.
    assert not is_strictly_increasing(
        pd.Series([0, 1, 2], dtype=int), by_value=2, start_value=0, sort_values=False
    )
    assert not is_strictly_increasing(
        pd.Series([0, 1, 2], dtype=int), by_value=1, start_value=1, sort_values=False
    )
    assert not is_strictly_increasing(
        pd.Series([1, 2, 3], dtype=int), by_value=1, start_value=0, sort_values=False
    )

    # A matching start value, or no start value constraint, is accepted.
    assert is_strictly_increasing(
        pd.Series([1, 2, 3], dtype=int), by_value=1, start_value=1, sort_values=False
    )
    assert is_strictly_increasing(
        pd.Series([1, 2, 3], dtype=int), by_value=1, start_value=None, sort_values=False
    )

    # A descending series passes only when values are sorted first.
    assert not is_strictly_increasing(
        pd.Series([3, 2, 1], dtype=int), by_value=1, start_value=None, sort_values=False
    )
    assert is_strictly_increasing(
        pd.Series([3, 2, 1], dtype=int), by_value=1, start_value=None, sort_values=True
    )

    # Steps other than 1 are supported.
    assert is_strictly_increasing(
        pd.Series([0, 10, 20], dtype=int), by_value=10, start_value=0, sort_values=True
    )

    # Default arguments: a series containing -1, and a non-numeric series,
    # are both rejected.
    assert not is_strictly_increasing(pd.Series([0, -1, 2], dtype=int))
    assert not is_strictly_increasing(pd.Series(['a', 'b', 'c']))