def get_reordered_words(c, s):
    """Return the words that were reordered between the two sentences.

    A reordered word is one that appears in both the source ``c`` and the
    simplification ``s`` but falls outside their longest common subsequence.
    """
    c, s = c.lower(), s.lower()
    common_subsequence = get_lcs(to_words(c), to_words(s))
    kept = Counter(get_kept_words(c, s))
    return flatten_counter(kept - Counter(common_subsequence))
def get_wordrank_score(sentence):
    """Return the third quartile of the log ranks of the sentence's words.

    Stopwords and punctuation tokens are stripped first, and words absent
    from the rank table are ignored.  When no rankable word remains, fall
    back to ``log(1 + vocabulary size)`` as a "very rare" score.
    """
    words = to_words(remove_stopwords(remove_punctuation_tokens(sentence)))
    words = [word for word in words if word in get_word2rank()]
    if not words:
        # TODO: This fallback value is completely arbitrary
        return np.log(1 + len(get_word2rank()))
    return np.quantile([get_log_rank(word) for word in words], 0.75)
def to_embeddings(sentence):
    """Return the FastText embedding rows for every word of *sentence*.

    The embedding matrix and its word index are loaded lazily into
    module-level globals on first call.  Out-of-vocabulary words map to
    the '<unk>' vector.
    """
    if 'EMBEDDINGS' not in globals():
        global EMBEDDINGS, WORD2INDEX
        print('Loading FastText embeddings...')
        EMBEDDINGS, WORD2INDEX = load_fasttext_embeddings(vocab_size=100000)
        print('Done.')
    # FastText embeddings are lowercase
    lowered = sentence.lower()
    indexes = []
    for word in to_words(lowered):
        indexes.append(WORD2INDEX.get(word, WORD2INDEX['<unk>']))
    return EMBEDDINGS[indexes]
def get_frequency_table_ranks(sentence):
    """Return log(1 + frequency-table rank) for each word, as an array."""
    ranks = [get_rank(word) for word in to_words(sentence)]
    return np.log(1 + np.array(ranks))
def get_concreteness_scores(sentence):
    """Return log(1 + concreteness score) for each word, as an array."""
    scores = [get_concreteness(word) for word in to_words(sentence)]
    return np.log(1 + np.array(scores))
def only_deleted_words(c, s):
    """True when ``s`` derives from ``c`` purely by deleting words.

    Simply counting deleted words is not sufficient because reordering may
    also have occurred; instead require the simplification to equal the
    longest common subsequence of the two sentences.
    """
    if is_exact_match(c, s):
        return False
    return get_lcs(to_words(c), to_words(s)) == to_words(s)
def get_kept_words(c, s):
    """Return the multiset intersection of the words of ``c`` and ``s``."""
    shared = Counter(to_words(c)) & Counter(to_words(s))
    return flatten_counter(shared)
def get_deleted_words(c, s):
    """Return the words of ``c`` that do not survive into ``s`` (multiset)."""
    removed = Counter(to_words(c)) - Counter(to_words(s))
    return flatten_counter(removed)
def get_deletions_proportion(complex_sentence, simple_sentence):
    """Proportion of words deleted going from the complex to the simple sentence.

    The count of deleted words (multiset difference) is normalized by the
    length of the longer of the two sentences.
    """
    complex_counts = Counter(to_words(complex_sentence))
    simple_counts = Counter(to_words(simple_sentence))
    n_deletions = sum((complex_counts - simple_counts).values())
    longest_length = max(count_words(complex_sentence),
                         count_words(simple_sentence))
    return n_deletions / longest_length
def word_intersection(complex_sentence, simple_sentence):
    """Overlap of unique words: |shared words| / max(|unique words| of each).

    NOTE(review): raises ZeroDivisionError when both sentences are empty —
    presumably callers never pass two empty sentences; confirm upstream.
    """
    complex_vocab = set(to_words(complex_sentence))
    simple_vocab = set(to_words(simple_sentence))
    n_common = len(complex_vocab & simple_vocab)
    return n_common / max(len(complex_vocab), len(simple_vocab))