def next_word(s: str) -> Parse:  # was next_word_syn
    """parser constructor that matches next word s, up to synonym,
    singularization, and case.

    >>> pstream(next_word('trial'), 'Trials x')
    LexToken(WORD,'trial',1,0)
    """
    syn = synonymize(lexer.singularize(s.lower()))
    def p(tok):
        return syn == synonymize(tok.value.lower())
    return next_any_word().if_test(p).name(s).setsample(sample.if_value(s))
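# Illustration of the synonym path (hypothetical words; this assumes 'item'
# and 'entry' were previously registered together via synonym_add below, so
# both reduce to the same canonical form, and that the lexer singularizes
# token values as in the doctest above):
#
#   pstream(next_word('item'), 'Entries x')
#   # -> would match, yielding LexToken(WORD,'entry',1,0), since
#   #    synonymize maps 'entry' and 'item' to one canonical form.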
def synonym_add(ts):
    """add synonym list to dictionary.

    All the words in the list are singularized, then made synonymous.
    The canonical form of the group of synonyms is created."""
    #XX Debug: should check that at most one variant in ts is defined anywhere.
    for s in ts:
        if len(s.split(' ')) > 1:
            raise DataProcess(f'synonym entries must be single words: {s}')
        if lexer.singularize(s) in synonym:
            raise DataProcess(f'synonym already declared: {s}')
        # len restriction prevents VAR from being added to dict.
        if len(s) < MIN_LEN_SYNONYM:
            raise DataProcess(f'synonyms must have at least {MIN_LEN_SYNONYM} chars: {s}')
        if not s.isalpha():
            raise DataProcess(f'synonyms must be words: {s}')
    # make the canonical_form
    ls = [lexer.singularize(s) for s in ts]
    ls.sort()
    canonical_form = ' '.join(ls)
    # record the canonical_form as the value under each singularized key
    for s in ls:
        synonym[s] = canonical_form
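# State sketch after a call (hypothetical words; assumes an initially empty
# synonym table and MIN_LEN_SYNONYM small enough to admit them):
#
#   synonym_add(['graphs', 'network'])
#   # synonym == {'graph': 'graph network', 'network': 'graph network'}
#
# A plausible reading of synonymize, consistent with the table built here
# (the actual definition lives elsewhere in this module): look up the
# canonical form, with an unknown word acting as its own canonical form,
# roughly synonym.get(s, s).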
def next_any_word_except(banned) -> Parse:
    """parser constructor that matches any next WORD token except banned.
    Matching is up to synonym, singularization, and case.

    >>> try:
    ...     pstream(next_any_word_except(['trial']), 'Trials x')
    ... except ParseError:
    ...     print('exception')
    exception

    >>> pstream(next_any_word_except(['trail']), 'Trials x')
    LexToken(WORD,'trial',1,0)
    """
    bansyn = [synonymize(lexer.singularize(b.lower())) for b in banned]
    def p(tok):
        return synonymize(tok.value.lower()) not in bansyn
    return Parse.next_token().if_types(['WORD']).if_test(p).setsample(sample.if_types(['WORD']))
def test_singular():
    for key in singular:
        assert_true('singular.' + key, lexer.singularize(key) == singular[key])
def not_key(s):
    """True if s, up to singularization and case, is not a pattern key."""
    return lexer.singularize(s.lower()) not in pattern_key
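# Usage sketch for not_key (hypothetical contents; pattern_key is assumed
# to be a collection of reserved pattern keywords defined elsewhere):
#
#   # with pattern_key == {'theorem'}:
#   not_key('Theorems')  # False: the singularized, lowercased form is a key
#   not_key('lemma')     # True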