def test_corpus_replace_characters_simple():
    """Exercise `Corpus.replace_characters` with several kinds of translation tables.

    One corpus is transformed step by step; after every step the complete
    document mapping is compared against the expected texts.
    """
    corp = Corpus({'doc1': 'ABC', 'doc2': 'abcDeF'})

    # str keys; values may be None (character is dropped), a str, or an integer code point
    corp.replace_characters({'a': None, 'C': 'c', 'e': ord('X')})
    assert corp.docs == {'doc1': 'ABc', 'doc2': 'bcDXF'}

    # integer code points work as keys as well
    corp.replace_characters({ord('A'): None})
    assert corp.docs == {'doc1': 'Bc', 'doc2': 'bcDXF'}

    # a ready-made table from str.maketrans; 'Y' occurs in no document
    corp.replace_characters(str.maketrans('DXFY', '1234'))
    expected_final = {'doc1': 'Bc', 'doc2': 'bc123'}
    assert corp.docs == expected_final

    # an empty table leaves all documents as they are
    corp.replace_characters({})
    assert corp.docs == expected_final
'̃': None, '̆': None, 'ҫ': 'ç', # they look the same but they aren't '‘': None, '’': None, '‚': ',', '“': None, '”': None, '„': None, '…': None, '\u202f': None, '�': None } print('replacing characters in each document of the corpus') corpus.replace_characters(char_transl_table) print('these non-ASCII characters are left:') pprint(corpus.unique_characters - set(string.printable)) #%% Correct contractions # some contractions have a stray space in between, like "EU -Hilfen" where it should be "EU-Hilfen" # correct this by applying a custom function with a regular expression (RE) to each document in the corpus pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)') print('correcting wrong contractions') # in each document text `t`, remove the RE group 2 (the stray white space "(\s+)") for each match `m` corpus.apply(lambda t: pttrn_contraction_ws.sub(lambda m: m.group(1) + m.group(3), t)) #%% Create a TMPreproc object for token processing