def test_compute_ttm_alternative_method():
    """Verify term-term co-occurrence counts by expanding explicit text windows.

    Builds padded sliding windows over a tiny corpus, renders them back to
    text, enumerates every unordered token pair inside each window, and
    checks the resulting pair counts against known-good values.
    """
    corpus: TokenizedCorpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_3DOCS)
    # Register '*' as the padding token, assigning it the next free id.
    corpus.token2id['*'] = (pad_id := len(corpus.token2id))
    id2token: dict = corpus.id2token

    # Convert corpus to numeric ids.
    corpus_token_ids = [[corpus.token2id[t] for t in tokens] for _, tokens in corpus]

    # Padded sliding windows (context width 1 on each side) per document.
    corpus_document_windows = [
        list(
            generate_windows(
                token_ids=token_ids,
                context_width=1,
                pad_id=pad_id,
                ignore_pads=False,
            )
        )
        for token_ids in corpus_token_ids
    ]

    # Render each window back to a compact text string for readability.
    corpus_document_text_windows = [[''.join([id2token[t] for t in w]) for w in d] for d in corpus_document_windows]
    corpus_document_text_windows = flatten(corpus_document_text_windows)

    assert corpus_document_text_windows == flatten(
        [
            ['*ab', 'abc', 'bcc', 'ccd', 'cdc', 'dce', 'ce*'],
            ['*aa', 'aac', 'ace', 'cec', 'ecd', 'cdd', 'dd*'],
            ['*de', 'dee', 'eeb', 'eb*'],
        ]
    )

    # Enumerate all 2-token combinations within each window.
    co_occurrence_instances = flatten(list(map(''.join, combinations(x, 2))) for x in corpus_document_text_windows)

    # Normalize pair order alphabetically and drop self-pairs before counting.
    co_occurrence_counts = collections.Counter(
        x if x[0] < x[1] else x[::-1] for x in co_occurrence_instances if x[0] != x[1]
    )

    assert dict(co_occurrence_counts) == {
        '*a': 3,
        '*b': 2,
        'ab': 2,
        'ac': 4,
        'bc': 3,
        'cd': 8,
        'de': 5,
        'ce': 6,
        '*c': 1,
        '*e': 3,
        'ae': 1,
        '*d': 3,
        'be': 3,
    }
def _token_ids_to_keep(kept_pair_ids: Set[int]) -> List[int]:
    """Return the sorted token ids that participate in any kept pair, plus magic tokens.

    NOTE(review): there is no `self` parameter, yet `self` is referenced —
    presumably this is a closure defined inside a method, capturing `self`
    from the enclosing scope; confirm against the enclosing definition.

    Args:
        kept_pair_ids: pair ids that survive filtering.

    Returns:
        Sorted list of token ids mapped (via token_ids_2_pair_id) to a kept
        pair id, unioned with the token2id "magic" (special/reserved) ids.
    """
    # Collect token ids whose associated pair id is in the kept set.
    # NOTE(review): `flatten` is applied to a generator of ints here —
    # verify the project `flatten` accepts non-nested input.
    token_ids_in_kept_pairs: Set[int] = set(
        flatten((k for k, pair_id in self.token_ids_2_pair_id.items() if pair_id in kept_pair_ids)))
    # Always retain the magic token ids regardless of pair membership.
    kept_token_ids: List[int] = sorted(
        list(
            token_ids_in_kept_pairs.union(
                set(self.token2id.magic_token_ids))))
    return kept_token_ids
def test_dict_of_key_values_to_dict_of_value_key():
    """Invert a dict of key -> list-of-values into value -> key, two ways."""
    x = {'a': [1, 2], 'b': [3, 4]}

    # Via flatten: build (value, key) tuples and collect them into a dict.
    y = {k: v for k, v in utility.flatten([[(v, k) for v in l] for k, l in x.items()])}
    assert y == {1: 'a', 2: 'a', 3: 'b', 4: 'b'}

    # Equivalent plain nested comprehension — no helper needed.
    y = {value: key for key in x for value in x[key]}
    assert y == {1: 'a', 2: 'a', 3: 'b', 4: 'b'}
def generate_document(self, words):
    """Create a token list from either a whitespace-separated string or
    an iterable of (count, word) pairs.

    Args:
        words: a string to split on whitespace, or (count, word) pairs
            where each word is repeated `count` times.

    Returns:
        List of word tokens.
    """
    if isinstance(words, str):
        return words.split()
    # Repeat each word `n` times as separate tokens. The previous
    # `flatten([n * w for n, w in words])` repeated the *string*
    # (n * w == 'www') and flattened that, which cannot yield n intact
    # tokens for multi-character words; cf. bow2text, which correctly
    # uses `f * [token]`.
    return [w for n, w in words for _ in range(n)]
def create_token2id(self) -> Token2Id:
    """Build a Token2Id mapping each distinct token to a consecutive id.

    Tokens are drawn from the second element of each item in self.data
    (presumably (document, tokens) pairs — confirm against the class) and
    ids are assigned in sorted token order.
    """
    # sorted() accepts the set directly; the former sorted(list(set(...)))
    # built a redundant intermediate list.
    vocabulary = sorted(set(flatten([x[1] for x in self.data])))
    return Token2Id({w: i for i, w in enumerate(vocabulary)})
def bow2text(document: List[Tuple[int, int]], id2token: Dict[int, str]) -> str:
    """Creates a text corpus out of a BoW corpus, repeating words in sequence.

    Args:
        document: (token_id, frequency) pairs.
        id2token: maps token id to its surface form.

    Returns:
        Space-joined text with each token repeated `frequency` times.
    """
    # Single generator pass; avoids materializing the intermediate nested
    # lists that `flatten([f * [id2token[t]] ...])` built before joining.
    return ' '.join(id2token[token_id] for token_id, f in document for _ in range(f))