def suitable_candidate(tokens, i, c):
    """Checks if a candidate is suitable for simplification."""
    # Candidate must be less complex (more frequent) than the original word.
    less_complex = zfreq(c[0], LANG) > zfreq(tokens[i], LANG)
    # Candidate must not be a morphological derivation of the original word.
    not_morph = c[0] not in tokens[i] and tokens[i] not in c[0]
    return less_complex and not_morph
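# The zfreq helper is not shown in this excerpt. A minimal sketch, assuming it
# wraps the `wordfreq` package's Zipf-scale frequency lookup (higher Zipf score
# means a more common, i.e. simpler, word); the LANG constant is a language
# code such as "en".

from wordfreq import zipf_frequency

def zfreq(word, lang):
    """Returns the Zipf frequency of a word (roughly 0-8, higher = more common)."""
    return zipf_frequency(word, lang)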
def simplify_token(tokens, i):
    """Simplifies a token given its index and all tokens."""
    # Generate candidates using BERT.
    candidates = generate_candidates(tokens, i)
    # Keep only suitable candidates; reverse so the list index can serve as
    # a ranking feature alongside the sorted rankings below.
    candidates = [c for c in candidates if suitable_candidate(tokens[i], c)][::-1]
    complex_ranked = sorted(candidates, key=lambda c: zfreq(c, config.lang))
    if models.embeddings:
        # If word embeddings have been loaded, include the cosine and APSynP
        # similarity metrics in the ranking.
        cosine_ranked = sorted(candidates, key=lambda c: cosine_sim(tokens[i], c))
        apsynp_ranked = sorted(candidates, key=lambda c: apsyn_sim(tokens[i], c))
        overall_ranked = [(c, candidates.index(c) + cosine_ranked.index(c)
                           + apsynp_ranked.index(c) + complex_ranked.index(c))
                          for c in candidates]
    else:
        # Otherwise, only use the BERT rank and frequency.
        overall_ranked = [(c, candidates.index(c) + complex_ranked.index(c))
                          for c in candidates]
    # Sort candidates by their combined rank.
    return sorted(overall_ranked, key=lambda c: c[1])
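# generate_candidates is defined elsewhere in the project. A minimal sketch of
# one plausible implementation, assuming the Hugging Face `transformers`
# fill-mask pipeline with a BERT model; the model name and top_k value here
# are illustrative, not the project's actual configuration.

from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def generate_candidates(tokens, i, top_k=10):
    """Masks tokens[i], lets BERT fill the blank, and returns candidate strings."""
    masked = tokens.copy()
    masked[i] = fill_mask.tokenizer.mask_token
    predictions = fill_mask(" ".join(masked), top_k=top_k)
    return [p["token_str"].strip() for p in predictions]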
def suitable_complex_word(w):
    """Checks if a detected word is suitable for replacing."""
    # Not a stopword or punctuation.
    not_stopword = w not in safe_get_stop_words(config.lang) and w.isalpha()
    # Not a simple word (frequency below the defined complexity threshold).
    not_simple = zfreq(w, config.lang) < config.min_complexity
    # No uppercase (ensures named entities are not simplified).
    not_uppercase = w.islower()
    return not_stopword and not_simple and not_uppercase
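# Illustrative usage of suitable_complex_word, assuming `config` exposes a
# `lang` code and a Zipf-scale `min_complexity` threshold, and that
# safe_get_stop_words comes from the `stop_words` package (it returns an
# empty list for unsupported languages rather than raising).

tokens = ["the", "committee", "reached", "a", "unanimous", "decision"]
complex_words = [w for w in tokens if suitable_complex_word(w)]
# With min_complexity around 4.0, this would flag rarer words such as
# "unanimous" while skipping stopwords and frequent vocabulary.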
def suitable_candidate(w, c):
    """Checks if a candidate is a suitable substitute based on various criteria."""
    source_stem = stemmer.stem(w)
    candidate_stem = stemmer.stem(c)
    # Stems must not share the same first three characters.
    not_stem_len = not (len(candidate_stem) >= 3
                        and candidate_stem[:3] == source_stem[:3])
    # Must not share a stem with the original word.
    not_equal_stem = source_stem != candidate_stem
    # Must not be punctuation (also enforced as part of not_stopword below).
    not_punctuation = c.isalpha()
    # Other checks (disable when benchmarking).
    not_morph_deriv = c not in w and w not in c
    not_complex = zfreq(c, config.lang) > zfreq(w, config.lang)
    not_stopword = c not in safe_get_stop_words(config.lang) and c.isalpha()
    return (not_equal_stem and not_stem_len and not_punctuation
            and not_morph_deriv and not_stopword and not_complex)
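# The `stemmer` used above is created elsewhere. A minimal sketch, assuming
# NLTK's SnowballStemmer with the language mapped from config.lang; the
# mapping dict here is illustrative.

from nltk.stem.snowball import SnowballStemmer

SNOWBALL_LANGS = {"en": "english", "es": "spanish", "de": "german"}
stemmer = SnowballStemmer(SNOWBALL_LANGS.get(config.lang, "english"))

# e.g. stemmer.stem("running") and stemmer.stem("runs") both yield "run",
# so "runs" would be rejected as a substitute for "running" by the
# shared-stem check above.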
def simplify_token(tokens, i):
    """Simplifies a token given its index and all tokens."""
    # Get the top CANDIDATE_NO most similar words and remove unsuitable ones.
    candidates = wv_model.most_similar(tokens[i], topn=CANDIDATE_NO)
    candidates = [c for c in candidates if suitable_candidate(tokens, i, c)]
    # Rank candidates on each feature.
    syntactic_ranked = sorted(candidates, key=lambda c: c[1])
    complexity_ranked = sorted(candidates, key=lambda c: zfreq(c[0], LANG))
    context_ranked = sorted(
        candidates, key=lambda c: context_sim(tokens, i, c[0], WINDOW_SIZE))
    # Combine the per-feature ranks into an overall rank for each candidate.
    overall_ranked = [(c[0], syntactic_ranked.index(c) + complexity_ranked.index(c)
                       + context_ranked.index(c)) for c in candidates]
    return overall_ranked
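# context_sim is not shown in this excerpt. A minimal sketch of one plausible
# definition, assuming it averages the embedding similarity between the
# candidate and the words inside a symmetric window around position i; the
# project's exact formulation may differ.

def context_sim(tokens, i, candidate, window_size):
    """Average similarity between a candidate and the words around tokens[i]."""
    lo = max(0, i - window_size)
    hi = min(len(tokens), i + window_size + 1)
    context = [t for j, t in enumerate(tokens[lo:hi], start=lo)
               if j != i and t in wv_model.vocab]
    if not context or candidate not in wv_model.vocab:
        return 0.0
    return sum(wv_model.similarity(candidate, t) for t in context) / len(context)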
if __name__ == '__main__':
    # If supplied, set the text to the user's input.
    if len(sys.argv) > 1:
        raw_str = sys.argv[1]
    else:
        raw_str = ("This is a particularly convoluted test sentence "
                   "requiring simplification.")
    # Work on a copy of the tokens to prevent changes to the original.
    tokens = tokenize(raw_str)
    tokens_copy = tokens.copy()
    for i in range(len(tokens_copy)):
        # Lowercase the current word.
        tokens_copy[i] = tokens_copy[i].lower()
        # Conditions ensuring only valid and complex words are simplified.
        word_valid = (tokens_copy[i] in wv_model.vocab
                      and tokens_copy[i] not in STOPWORDS)
        word_complex = zfreq(tokens_copy[i], LANG) < MIN_COMPLEXITY
        # Only simplify tokens that are in the model and not stopwords.
        if word_valid and word_complex:
            result = simplify_token(tokens_copy, i)
            print("Results for '" + tokens_copy[i] + "' - " + str(result) + "\n")
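# The module-level setup is elided from this excerpt. A minimal sketch of what
# the script appears to assume, using gensim's pre-4.0 KeyedVectors API
# (consistent with the `wv_model.vocab` check above) and NLTK tokenization;
# the file path and constant values here are illustrative.

import sys

from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from stop_words import safe_get_stop_words

LANG = "en"
MIN_COMPLEXITY = 4.0   # Zipf threshold below which a word counts as complex.
CANDIDATE_NO = 10      # Number of similar words requested from the model.
WINDOW_SIZE = 2        # Context window used by context_sim.
STOPWORDS = set(safe_get_stop_words(LANG))

wv_model = KeyedVectors.load_word2vec_format("embeddings.vec")
tokenize = word_tokenize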