예제 #1
0
def test_compute_ttm_alternative_method():
    """Cross-check window generation and co-occurrence counts via plain strings.

    Each document is converted to token ids, split into context windows
    (``context_width=1``, pads kept) by ``generate_windows``, rendered back to
    text and compared with the expected windows. Unordered pairs of distinct
    tokens are then counted per window and compared with the expected totals.
    """
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)

    # Register '*' as the padding token; its id is the vocabulary size before insertion.
    pad_id = len(corpus.token2id)
    corpus.token2id['*'] = pad_id
    id2token: dict = corpus.id2token

    # Convert corpus to numeric ids.
    corpus_token_ids = [[corpus.token2id[t] for t in tokens] for _, tokens in corpus]

    # One list of windows per document.
    corpus_document_windows = [
        list(
            generate_windows(
                token_ids=token_ids,
                context_width=1,
                pad_id=pad_id,
                ignore_pads=False,
            )
        )
        for token_ids in corpus_token_ids
    ]

    # Render each window as a string of tokens and drop the per-document nesting.
    corpus_document_text_windows = flatten(
        [[''.join(id2token[t] for t in w) for w in d] for d in corpus_document_windows]
    )

    assert corpus_document_text_windows == flatten(
        [
            ['*ab', 'abc', 'bcc', 'ccd', 'cdc', 'dce', 'ce*'],
            ['*aa', 'aac', 'ace', 'cec', 'ecd', 'cdd', 'dd*'],
            ['*de', 'dee', 'eeb', 'eb*'],
        ]
    )

    # Every 2-combination of positions inside a window is one co-occurrence instance.
    co_occurrence_instances = flatten(
        [''.join(pair) for pair in combinations(window, 2)] for window in corpus_document_text_windows
    )

    # Skip self-pairs and normalise the pair order so 'ba' is counted as 'ab'.
    co_occurrence_counts = collections.Counter(
        x if x[0] < x[1] else x[::-1] for x in co_occurrence_instances if x[0] != x[1]
    )

    assert dict(co_occurrence_counts) == {
        '*a': 3,
        '*b': 2,
        'ab': 2,
        'ac': 4,
        'bc': 3,
        'cd': 8,
        'de': 5,
        'ce': 6,
        '*c': 1,
        '*e': 3,
        'ae': 1,
        '*d': 3,
        'be': 3,
    }
예제 #2
0
파일: bundle.py 프로젝트: humlab/penelope
 def _token_ids_to_keep(kept_pair_ids: Set[int]) -> List[int]:
     """Return the sorted ids of all tokens that have to be retained.

     A token id is retained when it is part of a key in
     ``self.token_ids_2_pair_id`` whose pair id is in ``kept_pair_ids``, or
     when it is listed in ``self.token2id.magic_token_ids``.

     Note: ``self`` is not a parameter here; it is resolved from the
     enclosing scope.
     """
     retained: Set[int] = set(
         flatten(
             token_ids
             for token_ids, pair_id in self.token_ids_2_pair_id.items()
             if pair_id in kept_pair_ids
         )
     )
     # Magic tokens are always kept, whether or not they occur in a kept pair.
     retained.update(self.token2id.magic_token_ids)
     return sorted(retained)
예제 #3
0
def test_dict_of_key_values_to_dict_of_value_key():
    """Invert a key -> list-of-values mapping into value -> key, in two ways."""
    x = {'a': [1, 2], 'b': [3, 4]}
    expected = {1: 'a', 2: 'a', 3: 'b', 4: 'b'}

    # Variant 1: build one list of (value, key) pairs per key, then flatten.
    pairs_per_key = [[(v, k) for v in l] for k, l in x.items()]
    y = dict(utility.flatten(pairs_per_key))
    assert y == expected

    # (An equivalent itertools.chain based variant was sketched here but is not exercised.)

    # Variant 2: a single nested dict comprehension.
    y = {value: key for key, values in x.items() for value in values}
    assert y == expected
예제 #4
0
 def generate_document(self, words):
     """Build a token list from ``words``.

     A string is split on whitespace. Anything else is iterated as
     ``(n, w)`` pairs, and the products ``n * w`` are flattened into a single
     list (presumably ``w`` is a token sequence repeated ``n`` times -- confirm
     against callers).
     """
     if not isinstance(words, str):
         return flatten([n * w for n, w in words])
     return words.split()
예제 #5
0
 def create_token2id(self) -> Token2Id:
     """Create a ``Token2Id`` vocabulary from the tokens found in ``self.data``.

     The second element of every item in ``self.data`` is flattened into one
     token stream; the distinct tokens are sorted and numbered from 0 in that
     order.
     """
     vocabulary = sorted(set(flatten([item[1] for item in self.data])))
     return Token2Id({token: token_id for token_id, token in enumerate(vocabulary)})
예제 #6
0
파일: utils.py 프로젝트: humlab/penelope
def bow2text(document: List[Tuple[int, int]], id2token: Dict[int, str]) -> str:
    """Create a text out of a BoW document, repeating words in sequence.

    Args:
        document: ``(token_id, frequency)`` pairs.
        id2token: Mapping from token id to token string.

    Returns:
        The tokens joined by single spaces, each token repeated ``frequency``
        times, in the order the pairs appear in ``document``. Pairs with a
        frequency of zero or less contribute nothing.

    Raises:
        KeyError: If a token id in ``document`` is missing from ``id2token``.
    """
    # Accumulate into one list instead of building a list per token and
    # flattening afterwards.
    words: List[str] = []
    for token_id, frequency in document:
        words.extend([id2token[token_id]] * frequency)
    return ' '.join(words)