示例#1
0
def get_reordered_words(c, s):
    """Return the words shared by ``c`` and ``s`` that lie outside their LCS.

    A reordered word is a word that is contained in both the source ``c``
    and the simplification ``s`` but not in their longest common
    subsequence. Both sentences are lowercased before being compared.

    The result is whatever ``flatten_counter`` produces from the remaining
    word counts.
    """
    source = c.lower()
    simplification = s.lower()
    common_subsequence = get_lcs(to_words(source), to_words(simplification))
    shared_counts = Counter(get_kept_words(source, simplification))
    # Counter subtraction removes the shared words already covered by the
    # LCS and keeps only strictly positive counts.
    leftover_counts = shared_counts - Counter(common_subsequence)
    return flatten_counter(leftover_counts)
示例#2
0
def get_wordrank_score(sentence):
    """Return the third quartile of the log ranks of the sentence's words.

    Punctuation tokens and stopwords are stripped from ``sentence`` first,
    and words that are missing from the rank table returned by
    ``get_word2rank()`` are ignored.

    When no ranked word remains, ``log(1 + len(rank table))`` is returned
    as a fallback value.
    """
    words = to_words(remove_stopwords(remove_punctuation_tokens(sentence)))
    # Fetch the rank table once instead of once per word in the filter.
    word2rank = get_word2rank()
    ranked_words = [word for word in words if word in word2rank]
    if not ranked_words:
        return np.log(1 + len(word2rank))  # TODO: This is completely arbitrary
    return np.quantile([get_log_rank(word) for word in ranked_words], 0.75)
def to_embeddings(sentence):
    """Look up the FastText embeddings of every word in ``sentence``.

    On the first call the embeddings are loaded with a vocabulary size of
    100000 and stored in the module globals ``EMBEDDINGS`` and
    ``WORD2INDEX``; later calls reuse them. The sentence is lowercased
    because the FastText embeddings are lowercase, and words absent from
    ``WORD2INDEX`` fall back to the ``'<unk>'`` entry.

    Returns ``EMBEDDINGS`` indexed by the list of word indexes
    (presumably one embedding row per word -- confirm against
    ``load_fasttext_embeddings``).
    """
    global EMBEDDINGS, WORD2INDEX
    embeddings_loaded = 'EMBEDDINGS' in globals()
    if not embeddings_loaded:
        print('Loading FastText embeddings...')
        EMBEDDINGS, WORD2INDEX = load_fasttext_embeddings(vocab_size=100000)
        print('Done.')
    tokens = to_words(sentence.lower())
    indexes = [
        WORD2INDEX.get(token, WORD2INDEX['<unk>'])
        for token in tokens
    ]
    return EMBEDDINGS[indexes]
示例#4
0
def get_frequency_table_ranks(sentence):
    """Return ``log(1 + rank)`` for each word of ``sentence``.

    Each word produced by ``to_words`` is mapped through ``get_rank``; the
    ranks are collected into a NumPy array before the logarithm is applied
    element-wise.
    """
    ranks = np.array([get_rank(token) for token in to_words(sentence)])
    shifted_ranks = 1 + ranks
    return np.log(shifted_ranks)
示例#5
0
def get_concreteness_scores(sentence):
    """Return ``log(1 + concreteness)`` for each word of ``sentence``.

    Each word produced by ``to_words`` is mapped through
    ``get_concreteness``; the values are collected into a NumPy array
    before the logarithm is applied element-wise.
    """
    concreteness = np.array(
        [get_concreteness(token) for token in to_words(sentence)]
    )
    shifted_concreteness = 1 + concreteness
    return np.log(shifted_concreteness)
示例#6
0
def only_deleted_words(c, s):
    """Tell whether ``s`` differs from ``c`` purely by word deletions.

    This holds when the two sentences are not an exact match and the
    longest common subsequence of their words equals the words of ``s``.
    Merely counting deleted words would not be enough, because words are
    sometimes reordered.
    """
    if is_exact_match(c, s):
        return False
    common_subsequence = get_lcs(to_words(c), to_words(s))
    return common_subsequence == to_words(s)
示例#7
0
def get_kept_words(c, s):
    """Return the words that occur in both ``c`` and ``s``.

    Word occurrences are counted per sentence and intersected, so a word
    is kept as many times as the smaller of its two counts. The resulting
    counter is expanded by ``flatten_counter``.
    """
    counts_c = Counter(to_words(c))
    counts_s = Counter(to_words(s))
    # Counter intersection keeps the minimum count of each word.
    kept_counts = counts_c & counts_s
    return flatten_counter(kept_counts)
示例#8
0
def get_deleted_words(c, s):
    """Return the words of ``c`` that are missing from ``s``.

    Word occurrences are counted per sentence and the counts of ``s`` are
    subtracted from those of ``c``; only words with a positive remaining
    count survive. The resulting counter is expanded by
    ``flatten_counter``.
    """
    counts_c = Counter(to_words(c))
    counts_s = Counter(to_words(s))
    # Counter subtraction discards zero and negative counts.
    deleted_counts = counts_c - counts_s
    return flatten_counter(deleted_counts)
示例#9
0
def get_deletions_proportion(complex_sentence, simple_sentence):
    """Return the proportion of words deleted from the complex sentence.

    The number of deletions is the size of the multiset difference between
    the words of ``complex_sentence`` and those of ``simple_sentence``. It
    is divided by the larger of the two sentences' word counts, as given
    by ``count_words``.

    Returns 0.0 when both word counts are zero (for instance two empty
    sentences) instead of raising ``ZeroDivisionError``.
    """
    complex_counts = Counter(to_words(complex_sentence))
    simple_counts = Counter(to_words(simple_sentence))
    # Counter subtraction keeps only the words that occur more often in
    # the complex sentence than in the simple one.
    n_deletions = sum((complex_counts - simple_counts).values())
    max_word_count = max(count_words(complex_sentence),
                         count_words(simple_sentence))
    if max_word_count == 0:
        # Nothing to normalise by: avoid dividing by zero.
        return 0.0
    return n_deletions / max_word_count
示例#10
0
def word_intersection(complex_sentence, simple_sentence):
    """Return the fraction of distinct words shared by the two sentences.

    The number of distinct words common to both sentences is divided by
    the number of distinct words in whichever sentence has more of them.

    Returns 0.0 when neither sentence yields any word (there are no common
    words) instead of raising ``ZeroDivisionError``.
    """
    complex_vocab = set(to_words(complex_sentence))
    simple_vocab = set(to_words(simple_sentence))
    nb_max_words = max(len(complex_vocab), len(simple_vocab))
    if nb_max_words == 0:
        # Both vocabularies are empty: avoid dividing by zero.
        return 0.0
    nb_common_words = len(complex_vocab & simple_vocab)
    return nb_common_words / nb_max_words