예제 #1
0
def get_strictly_increasing_document_id(
        document_index: DocumentIndex,
        document_id_field: str = 'document_id') -> pd.Series:
    """Pick a sequence of document ids that `is_strictly_increasing` accepts.

    Candidates are tried in this order and the first accepted one is returned:

    1. the `document_id_field` column, if the frame has such a column;
    2. the frame's current index;
    3. the positional index produced by `document_index.reset_index()`,
       used as a last resort without any further check.

    Args:
        document_index (DocumentIndex): Frame-like object exposing `columns`,
            `index`, column lookup by name, and `reset_index()`.
        document_id_field (str): Name of the column to try first.

    Returns:
        pd.Series: The id column when candidate 1 is accepted. Note that
        candidates 2 and 3 return the frame's index object rather than a
        column.

    Raises:
        ValueError: If neither candidate 1 nor 2 is accepted and the current
            index has dtype int64.
    """
    has_id_column = document_id_field in document_index.columns
    if has_id_column and is_strictly_increasing(
            document_index[document_id_field]):
        return document_index[document_id_field]

    current_index = document_index.index
    if is_strictly_increasing(current_index):
        return current_index

    # An int64 index that is not strictly increasing was previously handled by
    # the deprecated document_index_upgrade(); it is not expected to occur, so
    # fail loudly instead of silently renumbering.
    if current_index.dtype == np.dtype('int64'):
        raise ValueError(
            "Integer index encountered that are not strictly increasing!")

    return document_index.reset_index().index
예제 #2
0
def test_group_by_time_period_aggregates_DTM_to_PTM():
    """Grouping by a time period is expected to sum document rows per period."""

    # Five documents over four tokens; the last two documents share year 2017.
    term_counts = np.array([
        [2, 1, 4, 1],
        [2, 2, 3, 0],
        [2, 3, 2, 0],
        [2, 4, 1, 1],
        [2, 0, 1, 1],
    ])
    vocabulary = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    document_index = pd.DataFrame({
        'year': [2009, 2013, 2014, 2017, 2017],
        'filename':
        ['2009.txt', '2013.txt', '2014.txt', '2017.txt', '2017.txt'],
        'document_id': [0, 1, 2, 3, 4],
    })
    corpus = VectorizedCorpus(term_counts,
                              token2id=vocabulary,
                              document_index=document_index)

    # Expected period-term matrix for each specifier, without gap filling.
    expectations = [
        ('year', [[2, 1, 4, 1], [2, 2, 3, 0], [2, 3, 2, 0], [4, 4, 2, 2]]),
        ('lustrum', [[2, 1, 4, 1], [4, 5, 5, 0], [4, 4, 2, 2]]),
        ('decade', [[2, 1, 4, 1], [8, 9, 7, 2]]),
    ]
    for period, expected_ytm in expectations:
        grouped_corpus = corpus.group_by_time_period_optimized(
            time_period_specifier=period)
        assert np.allclose(expected_ytm,
                           grouped_corpus.bag_term_matrix.todense())

    # With fill_gaps=True every year from 2009 through 2017 is expected to get
    # a row, with all-zero rows for the years that have no documents.
    grouped_corpus = corpus.group_by_time_period_optimized(
        time_period_specifier='year', fill_gaps=True)
    expected_ytm = np.matrix([
        [2, 1, 4, 1],  # 2009
        [0, 0, 0, 0],  # 2010
        [0, 0, 0, 0],  # 2011
        [0, 0, 0, 0],  # 2012
        [2, 2, 3, 0],  # 2013
        [2, 3, 2, 0],  # 2014
        [0, 0, 0, 0],  # 2015
        [0, 0, 0, 0],  # 2016
        [4, 4, 2, 2],  # 2017
    ])
    gap_filled_index = grouped_corpus.document_index
    assert np.allclose(expected_ytm, grouped_corpus.bag_term_matrix.todense())
    assert len(gap_filled_index) == 9
    assert is_strictly_increasing(gap_filled_index.index, sort_values=False)
예제 #3
0
    def _ingest_document_index(self, document_index: DocumentIndex):
        """Validate `document_index` against the bag-term matrix and return it.

        Steps, in order:

        1. If the index dtype is not a NumPy numeric dtype, log a warning and
           re-key the frame on its `document_id` column (the column is kept)
           with the axis name set to ''.
        2. Reject the frame if `utility.is_strictly_increasing` does not
           accept its index.
        3. Reject the frame if its length differs from the number of rows in
           `self._bag_term_matrix`.
        4. Add an `n_raw_tokens` column from `self.document_token_counts` when
           the frame does not already have one.

        NOTE: when step 1 does not re-key the frame, step 4 writes the new
        column onto the object that was passed in.

        Args:
            document_index (DocumentIndex): Frame-like document index.

        Returns:
            The validated (and possibly re-keyed / extended) document index.

        Raises:
            ValueError: If step 2 or step 3 rejects the frame.
        """
        index_is_numeric = np.issubdtype(document_index.index.dtype, np.number)
        if not index_is_numeric:
            logger.warning(
                "VectorizedCorpus: supplied document index has not an integral index"
            )
            rekeyed = document_index.set_index('document_id', drop=False)
            document_index = rekeyed.rename_axis('')

        if not utility.is_strictly_increasing(document_index.index):
            raise ValueError(
                "supplied `document index` must have an integer typed, strictly increasing index starting from 0"
            )

        expected_length = self._bag_term_matrix.shape[0]
        actual_length = len(document_index)
        if actual_length != expected_length:
            raise ValueError(
                f"expected `document index` to have length {expected_length} but found length {actual_length}"
            )

        if 'n_raw_tokens' in document_index.columns:
            return document_index

        document_index['n_raw_tokens'] = self.document_token_counts
        return document_index
예제 #4
0
def test_is_strictly_increasing():
    """Pin the expected results of `is_strictly_increasing` for small series.

    The series are built with the builtin `int` dtype. The former `np.int`
    alias (which was just the builtin `int`) was deprecated in NumPy 1.20 and
    removed in NumPy 1.24, so using it raises AttributeError on current NumPy.
    """
    # Step of 1, with and without an explicit start value.
    assert is_strictly_increasing(pd.Series([0, 1, 2], dtype=int),
                                  by_value=1)
    assert is_strictly_increasing(pd.Series([0, 1, 2], dtype=int),
                                  by_value=1,
                                  start_value=0,
                                  sort_values=False)

    # Wrong step or wrong start value is expected to be rejected.
    assert not is_strictly_increasing(pd.Series([0, 1, 2], dtype=int),
                                      by_value=2,
                                      start_value=0,
                                      sort_values=False)
    assert not is_strictly_increasing(pd.Series([0, 1, 2], dtype=int),
                                      by_value=1,
                                      start_value=1,
                                      sort_values=False)
    assert not is_strictly_increasing(pd.Series([1, 2, 3], dtype=int),
                                      by_value=1,
                                      start_value=0,
                                      sort_values=False)

    # A series starting at 1 is expected to pass when the start value matches
    # or is left unconstrained (None).
    assert is_strictly_increasing(pd.Series([1, 2, 3], dtype=int),
                                  by_value=1,
                                  start_value=1,
                                  sort_values=False)
    assert is_strictly_increasing(pd.Series([1, 2, 3], dtype=int),
                                  by_value=1,
                                  start_value=None,
                                  sort_values=False)

    # A descending series is expected to pass only when sorting is requested.
    assert not is_strictly_increasing(pd.Series([3, 2, 1], dtype=int),
                                      by_value=1,
                                      start_value=None,
                                      sort_values=False)
    assert is_strictly_increasing(pd.Series([3, 2, 1], dtype=int),
                                  by_value=1,
                                  start_value=None,
                                  sort_values=True)

    # A step other than 1.
    assert is_strictly_increasing(pd.Series([0, 10, 20], dtype=int),
                                  by_value=10,
                                  start_value=0,
                                  sort_values=True)

    # Default arguments: a non-monotonic integer series and a string series
    # are both expected to be rejected.
    assert not is_strictly_increasing(pd.Series([0, -1, 2], dtype=int))
    assert not is_strictly_increasing(pd.Series(['a', 'b', 'c']))