Python remove_punctuation 예제들, features_util.remove_punctuation Python 예제들

예제 #1

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def mean_sent_len_char(sentences):
    """Return mean sentence length in characters.

    Each sentence is passed through ``f_utils.remove_punctuation`` and the
    lengths of the remaining words are summed, so only characters that are
    part of a word are counted.

    Returns 0 when ``sentences`` is empty.
    """
    # Build a real list: calling ``len()`` on a lazy ``map`` object raises
    # TypeError on Python 3.
    sentlen_char = [
        sum(len(word) for word in f_utils.remove_punctuation(sentence))
        for sentence in sentences
    ]
    if not sentlen_char:
        # No sentences: report 0 instead of dividing by zero.
        return 0
    return sum(sentlen_char) / len(sentlen_char)

예제 #2

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def digit_words_19(sentences):
    """Yield features about small numbers written as words or as digits.

    A "digit word" is a token contained in ``_DIGITWORDS_19``; a "digit" is
    an all-digit token whose value is below 20.

    Yields ``(name, value)`` pairs:

    * ``("digit_words_19", 0)`` only, when neither kind occurs;
    * otherwise ``digit_words_19`` and ``digits_19`` (each count divided by
      the total number of words), followed by one ``digit_words_19_<word>``
      / ``digits_19_<digits>`` pair per distinct token (its count divided by
      the combined number of digit words and digits).
    """
    counter_digwords = Counter()
    counter_digits = Counter()
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        total_words += len(s)
        # update() counts in place; no throwaway Counter per sentence.
        counter_digwords.update(
            word for word in s if word in _DIGITWORDS_19)
        # isdigit() already rules out signs and decimal points, so only the
        # upper bound needs checking.
        counter_digits.update(
            word for word in s if word.isdigit() and int(word) < 20)

    sum_digwords = sum(counter_digwords.values())
    sum_digits = sum(counter_digits.values())
    total = sum_digwords + sum_digits
    if total == 0:
        yield "digit_words_19", 0
    else:
        # total > 0 implies total_words > 0, so these divisions are safe.
        yield "digit_words_19", sum_digwords / total_words
        yield "digits_19", sum_digits / total_words

        # Iterate the Counter directly: ``iterkeys()`` exists only on
        # Python 2.
        for key in counter_digwords:
            yield "digit_words_19_" + key, counter_digwords[key] / total
        for key in counter_digits:
            yield "digits_19_" + key, counter_digits[key] / total

예제 #3

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def medium_words(sentences):
    """Return ratio of medium-length words (3 to 7 characters) / total words.

    Words are the tokens left after ``f_utils.remove_punctuation``.
    Returns 0 when there are no words at all.
    """
    # NOTE(review): length 3 is also matched by short_words' ``<= 3`` test,
    # so 3-character words are counted by both features -- confirm intent.
    medium_count = 0
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        medium_count += sum(1 for word in s if 3 <= len(word) <= 7)
        total_words += len(s)
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        return 0
    return medium_count / total_words

예제 #4

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def long_words(sentences):
    """Return ratio of long words (more than 7 characters) / total words.

    Words are the tokens left after ``f_utils.remove_punctuation``.
    Returns 0 when there are no words at all.
    """
    long_count = 0
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        long_count += sum(1 for word in s if len(word) > 7)
        total_words += len(s)
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        return 0
    return long_count / total_words

예제 #5

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def short_words(sentences):
    """Return ratio of short words (at most 3 characters) / total words.

    Words are the tokens left after ``f_utils.remove_punctuation``.
    Returns 0 when there are no words at all.
    """
    short_count = 0
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        short_count += sum(1 for word in s if len(word) <= 3)
        total_words += len(s)
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        return 0
    return short_count / total_words

예제 #6

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def apostrophes(sentences):
    """Return ratio of words which contain an apostrophe / total words.

    Words are the tokens left after ``f_utils.remove_punctuation``; a word
    counts when it contains the ASCII apostrophe character ``'``.
    Returns 0 when there are no words at all.
    """
    apos_count = 0
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        apos_count += sum(1 for word in s if "'" in word)
        total_words += len(s)
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        return 0
    return apos_count / total_words

예제 #7

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def voc_richness(sentences):
    """Return ratio different words / total words.

    Tokens are stripped of punctuation and digits (via ``f_utils``), then
    stemmed with ``PorterStemmer`` and lower-cased before counting, so
    "different words" means distinct lower-cased stems.
    Returns 0 when no words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()

    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        # update() counts in place; no throwaway Counter per sentence.
        counter.update(stemmer.stem(word).lower() for word in s)

    total_words = sum(counter.values())
    if total_words == 0:
        # Empty vocabulary: report 0 instead of dividing by zero.
        return 0
    return len(counter) / total_words

예제 #8

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def mean_word_len(sentences):
    """Return the average length of all words, or 0 if there are none.

    Words are the tokens left after ``f_utils.remove_punctuation`` has been
    applied to each sentence.
    """
    lengths = [
        len(token)
        for sentence in sentences
        for token in f_utils.remove_punctuation(sentence)
    ]

    if not lengths:
        return 0
    return sum(lengths) / len(lengths)

예제 #9

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def char_freq(sentences):
    """Yield relative character frequencies.

    Characters are taken from the lower-cased words left after
    ``f_utils.remove_punctuation``. Yields one ``("char_freq_<c>", ratio)``
    pair per distinct character, where ratio is that character's count
    divided by the number of counted characters. Yields nothing when there
    are no characters.
    """
    counter = Counter()
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            # update() on a string counts its characters in place, avoiding
            # a new Counter plus a merge for every single word.
            counter.update(w.lower())
    sum_char = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for k in counter:
        yield "char_freq_" + k, counter[k] / sum_char

예제 #10

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def vowels(sentences):
    """Return ratio vowels / total alpha-chars.

    Only characters for which ``str.isalpha()`` is true are considered; a
    vowel is a character whose lower-cased form is one of ``a e i o u``.
    Returns 0 when there are no alphabetic characters.
    """
    num_alphas = 0
    num_vowels = 0
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            for char in w:
                if not char.isalpha():
                    continue
                num_alphas += 1
                if char.lower() in "aeiou":
                    num_vowels += 1

    # Plain integer counts replace ``len(filter(...))``, which raises
    # TypeError on Python 3 because ``filter`` returns an iterator there.
    if num_alphas == 0:
        return 0
    return num_vowels / num_alphas

예제 #11

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def char_bigrams(sentences):
    """Yield relative character bigram frequencies.

    Bigrams are built per word (never across word boundaries) from the
    lower-cased tokens left after ``f_utils.remove_punctuation``. Yields one
    ``("char_bigram_<a>_<b>", ratio)`` pair per distinct bigram, where ratio
    is the bigram's count divided by the total number of bigrams. Yields
    nothing when there are no bigrams.
    """
    counter = Counter()
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            counter.update(
                a + '_' + b for a, b in nltk.ngrams(w.lower(), 2))
    sum_counter = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for k in counter:
        yield "char_bigram_" + k, counter[k] / sum_counter

예제 #12

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def word_len_freq(sentences):
    """Yield relative frequency of word lengths up to 20 characters.

    Words longer than 20 characters are ignored entirely (they are not part
    of the denominator either). Yields one ``("word_len_freq_<n>", ratio)``
    pair per observed length ``n``. Yields nothing when no word qualifies.
    """
    counter = Counter()
    for s in sentences:
        # Single pass per sentence: measure, filter and stringify together.
        counter.update(
            str(len(word))
            for word in f_utils.remove_punctuation(s)
            if len(word) <= 20)

    sum_words = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "word_len_freq_" + key, counter[key] / sum_words

예제 #13

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def sent_len_4grams(sentences):
    """Yield sentence length 4-gram frequencies.

    Each sentence's token count (after ``f_utils.remove_punctuation``) is
    converted to a tag by ``f_utils.tag_sent_len``; every run of four
    consecutive tags is concatenated into one key. Yields one
    ``("sent_len_4gram_<tags>", ratio)`` pair per distinct key, where ratio
    is the key's count divided by the total number of 4-grams. Yields
    nothing when there are fewer than four sentences.
    """
    sent_len = [
        f_utils.tag_sent_len(len(f_utils.remove_punctuation(sentence)))
        for sentence in sentences
    ]
    counter = Counter(
        a + b + c + d for a, b, c, d in nltk.util.ngrams(sent_len, 4))

    sum_counter = sum(counter.values())

    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "sent_len_4gram_" + key, counter[key] / sum_counter

예제 #14

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def sent_len_bigrams(sentences):
    """Yield sentence length bigram frequencies.

    Each sentence's token count (after ``f_utils.remove_punctuation``) is
    converted to a tag by ``f_utils.tag_sent_len``; every pair of
    consecutive tags is concatenated into one key. Yields one
    ``("sent_len_bigram_<tags>", ratio)`` pair per distinct key, where ratio
    is the key's count divided by the total number of bigrams. Yields
    nothing when there are fewer than two sentences.
    """
    sent_len = [
        f_utils.tag_sent_len(len(f_utils.remove_punctuation(sentence)))
        for sentence in sentences
    ]
    counter = Counter(a + b for a, b in nltk.util.ngrams(sent_len, 2))

    sum_counter = sum(counter.values())

    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "sent_len_bigram_" + key, counter[key] / sum_counter

예제 #15

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def word_len_bigrams(sentences):
    """Yield word length bigram frequencies.

    For every sentence, the lengths of consecutive word pairs (tokens left
    after ``f_utils.remove_punctuation``) form a bigram; bigrams never span
    two sentences. Yields one ``("word_len_bigram_<a>_<b>", ratio)`` pair
    per distinct bigram, where ratio is the bigram's count divided by the
    total number of bigrams. Yields nothing when there are no bigrams.
    """
    counter = Counter()
    for s in sentences:
        wordlen = [len(word) for word in f_utils.remove_punctuation(s)]
        counter.update(
            str(a) + '_' + str(b) for a, b in nltk.util.ngrams(wordlen, 2))

    sum_bigrams = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "word_len_bigram_" + key, counter[key] / sum_bigrams

예제 #16

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def voc_hapax_legomenon(sentences):
    """Return ratio of words occurring exactly once / total words.

    Tokens are stripped of punctuation and digits (via ``f_utils``), then
    stemmed with ``PorterStemmer`` and lower-cased, so occurrences are
    counted per lower-cased stem. Returns 0 when no words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter.update(stemmer.stem(word).lower() for word in s)

    sum_counter = sum(counter.values())
    if sum_counter == 0:
        # Empty vocabulary: report 0 instead of dividing by zero.
        return 0
    # Count directly rather than ``len(filter(...))``, which raises
    # TypeError on Python 3 because ``filter`` returns an iterator there.
    unique = sum(1 for count in counter.values() if count == 1)
    return unique / sum_counter

예제 #17

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def voc_dis_legomenon(sentences):
    """Return ratio of words occurring exactly twice / total words.

    Tokens are stripped of punctuation and digits (via ``f_utils``), then
    stemmed with ``PorterStemmer`` and lower-cased, so occurrences are
    counted per lower-cased stem. Returns 0 when no words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter.update(stemmer.stem(word).lower() for word in s)

    sum_counter = sum(counter.values())
    if sum_counter == 0:
        # Empty vocabulary: report 0 instead of dividing by zero.
        return 0
    # Count directly rather than ``len(filter(...))``, which raises
    # TypeError on Python 3 because ``filter`` returns an iterator there.
    twice = sum(1 for count in counter.values() if count == 2)
    return twice / sum_counter

예제 #18

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def char_4grams(sentences):
    """Yield relative character 4-gram frequencies.

    4-grams are built per word (never across word boundaries) from the
    lower-cased tokens left after ``f_utils.remove_punctuation``. Yields one
    ``("char_4gram_<a>_<b>_<c>_<d>", ratio)`` pair per distinct 4-gram,
    where ratio is its count divided by the total number of 4-grams. Yields
    nothing when there are no 4-grams.
    """
    counter = Counter()
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            counter.update(
                a + '_' + b + '_' + c + '_' + d
                for a, b, c, d in nltk.ngrams(w.lower(), 4))
    sum_counter = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for k in counter:
        yield "char_4gram_" + k, counter[k] / sum_counter

예제 #19

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def ordinals(sentences):
    """Return the share of ordinal tokens among all words.

    A token is an ordinal when ``f_utils.is_ordinal`` accepts it; words are
    the tokens left after ``f_utils.remove_punctuation``. The result is 0
    when no ordinal is found.
    """
    ordinal_count = 0
    word_count = 0
    for sentence in sentences:
        tokens = f_utils.remove_punctuation(sentence)
        word_count += len(tokens)
        ordinal_count += len(
            [tok for tok in tokens if f_utils.is_ordinal(tok)])

    if ordinal_count:
        return ordinal_count / word_count
    return 0

예제 #20

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def char_trigrams(sentences):
    """Yield relative character trigram frequencies.

    Only purely alphabetic words longer than three characters contribute.
    Trigrams are built per word from its lower-cased form. Yields one
    ``("char_trigram_<a>_<b>_<c>", ratio)`` pair per distinct trigram, where
    ratio is its count divided by the total number of trigrams. Yields
    nothing when there are no trigrams.
    """
    counter = Counter()
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            # NOTE(review): ``<= 3`` also skips 3-letter words, although
            # each would yield exactly one trigram -- confirm this is
            # intended.
            if len(w) <= 3 or not w.isalpha():
                continue
            counter.update(
                a + '_' + b + '_' + c
                for a, b, c in nltk.ngrams(w.lower(), 3))
    sum_counter = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for k in counter:
        yield "char_trigram_" + k, counter[k] / sum_counter

예제 #21

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def percent(sentences):
    """Yield ratio of the word 'percent' and the sign '%' / total words.

    Matches are counted on the raw tokens of each sentence; the total word
    count uses the tokens left after ``f_utils.remove_punctuation``.

    Yields ``("percent", 0)`` when nothing matches (or when there are no
    words to divide by); otherwise one ``(token, ratio)`` pair per distinct
    matched token exactly as it appeared (e.g. ``"Percent"``, ``"%"``).
    """
    counter = Counter()
    total_words = 0
    for s in sentences:
        # Count before stripping punctuation -- presumably because "%" would
        # not survive remove_punctuation (TODO confirm against f_utils).
        counter.update(
            word for word in s
            if word.lower() == "percent" or word == "%")
        total_words += len(f_utils.remove_punctuation(s))

    sum_counter = sum(counter.values())
    # total_words can be 0 even with matches if every token was stripped;
    # fall back to the neutral feature instead of dividing by zero.
    if sum_counter == 0 or total_words == 0:
        yield "percent", 0
    else:
        # Iterate the Counter directly: ``iterkeys()`` exists only on
        # Python 2.
        for key in counter:
            yield key, counter[key] / total_words
예제 #22

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def fraction_words(sentences):
    """Return the share of fraction words among all words.

    A token counts when ``f_utils.endswith_fractionword`` accepts it; words
    are the tokens left after ``f_utils.remove_punctuation``. The result is
    0 when no fraction word is found.
    """
    fraction_count = 0
    word_count = 0
    for sentence in sentences:
        tokens = f_utils.remove_punctuation(sentence)
        word_count += len(tokens)
        fraction_count += len(
            [tok for tok in tokens if f_utils.endswith_fractionword(tok)])

    if fraction_count:
        return fraction_count / word_count
    return 0

예제 #23

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def multiplicat_num(sentences):
    """Yield ratio of multiplicative numbers (once, twice, thrice) / words.

    A token matches when its lower-cased form is in ``_MULTIPLICAT_NUM``;
    it is counted under its original spelling, so differently cased forms
    produce separate features.

    Yields ``("times_once", 0)`` when nothing matches; otherwise one
    ``("times_<token>", ratio)`` pair per distinct matched token, where
    ratio is its count divided by the total number of words.
    """
    counter = Counter()
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        total_words += len(s)
        counter.update(
            word for word in s if word.lower() in _MULTIPLICAT_NUM)

    sum_counter = sum(counter.values())
    if sum_counter == 0:
        yield "times_once", 0
    else:
        # Iterate the Counter directly: ``iterkeys()`` exists only on
        # Python 2.
        for key in counter:
            yield "times_" + key, counter[key] / total_words

예제 #24

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def word_len_4grams(sentences):
    """Yield word length 4-gram frequencies.

    For every sentence, the lengths of four consecutive words (tokens left
    after ``f_utils.remove_punctuation``) form a 4-gram; 4-grams never span
    two sentences. Yields one ``("word_len_4gram_<a>_<b>_<c>_<d>", ratio)``
    pair per distinct 4-gram, where ratio is its count divided by the total
    number of 4-grams. Yields nothing when there are no 4-grams.
    """
    counter = Counter()
    for s in sentences:
        wordlen = [len(word) for word in f_utils.remove_punctuation(s)]
        counter.update(
            str(a) + '_' + str(b) + '_' + str(c) + '_' + str(d)
            for a, b, c, d in nltk.util.ngrams(wordlen, 4))

    sum_4grams = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "word_len_4gram_" + key, counter[key] / sum_4grams

예제 #25

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def freq_consonants(sentences):
    """Yield ratio of consonant groups / total alpha-chars.

    The four groups are ``tnsrh``, ``ldcpf``, ``mwybg`` and ``jkqvxz``;
    characters are compared in lower case. Always yields four
    ``("freq_cons_<group>", ratio)`` pairs in that order; every ratio is 0
    when the input contains no alphabetic characters.
    """
    groups = ("tnsrh", "ldcpf", "mwybg", "jkqvxz")
    group_counts = dict.fromkeys(groups, 0)
    num_alphas = 0

    # One pass with integer counters replaces four ``len(filter(...))``
    # passes, which raise TypeError on Python 3 (``filter`` is lazy there).
    for s in sentences:
        for w in f_utils.remove_punctuation(s):
            for char in w:
                if not char.isalpha():
                    continue
                num_alphas += 1
                lowered = char.lower()
                for group in groups:
                    if lowered in group:
                        group_counts[group] += 1

    for group in groups:
        if num_alphas == 0:
            # No letters at all: report 0 instead of dividing by zero.
            yield "freq_cons_" + group, 0
        else:
            yield "freq_cons_" + group, group_counts[group] / num_alphas

예제 #26

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def auxiliary_verbs(sentences):
    """Yield ratio auxiliary verb / all auxiliary verbs

    + ratio all auxiliary verbs / total words.

    ``_VERBS_AUX`` maps a feature key to a collection of verb forms. For
    every key with at least one hit, yields ``("auxverb_<key>", ratio)``
    where ratio is that key's hits divided by all auxiliary-verb hits.
    Finally yields ``("auxiliary_verbs", ratio)`` with all hits divided by
    the total number of words (0 when there are no words).
    """
    total_words = 0
    counter = Counter()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        total_words += len(s)
        for key, verbs in _VERBS_AUX.items():
            hits = sum(1 for word in s if word in verbs)
            # Only store positive counts so that keys without any hit
            # produce no feature (Counter addition dropped them before).
            if hits:
                counter[key] += hits

    sum_counter = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield "auxverb_" + key, counter[key] / sum_counter
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        yield "auxiliary_verbs", 0
    else:
        yield "auxiliary_verbs", sum_counter / total_words

예제 #27

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def num_times(sentences):
    """Yield ratio of one/two/three time(s) / total words.

    Word bigrams are built per sentence from the lower-cased tokens left
    after ``f_utils.remove_punctuation``; a bigram matches when it is
    contained in ``_NUMTIMES_BIGRAMS``.

    Yields ``("times_one_time", 0)`` when nothing matches; otherwise one
    ``("times_<first>_<second>", ratio)`` pair per distinct matched bigram,
    where ratio is its count divided by the total number of words.
    """
    counter = Counter()
    total_words = 0
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        total_words += len(s)
        lowered = [word.lower() for word in s]
        counter.update(
            bigram for bigram in nltk.ngrams(lowered, 2)
            if bigram in _NUMTIMES_BIGRAMS)

    sum_counter = sum(counter.values())
    if sum_counter == 0:
        yield "times_one_time", 0
    else:
        # Iterate the Counter directly: ``iterkeys()`` exists only on
        # Python 2.
        for key in counter:
            yield "times_" + key[0] + "_" + key[1], counter[key] / total_words

예제 #28

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def voc_yule(sentences):
    """Return a modified Yule's I vocabulary measure.

    Tokens are stripped of punctuation and digits, stemmed and lower-cased.
    With M1 the number of distinct stems and M2 the sum of squared stem
    frequencies, the result is ``(M1 * M2 / (M2 - M1)) / total_words``;
    0.0 is returned whenever that computation divides by zero.
    """
    stem = PorterStemmer().stem
    stem_counts = Counter()
    for sentence in sentences:
        tokens = f_utils.remove_punctuation(sentence)
        tokens = f_utils.remove_digits(tokens)
        stem_counts += Counter(stem(tok).lower() for tok in tokens)

    frequencies = list(stem_counts.values())
    m1 = float(len(stem_counts))
    # Summing freq**2 over every stem equals summing
    # (group size * freq**2) over runs of equal frequencies.
    m2 = sum(freq ** 2 for freq in frequencies)

    try:
        return ((m1 * m2) / (m2 - m1)) / sum(frequencies)
    except ZeroDivisionError:
        return 0.0

예제 #29

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def voc_bottom10(sentences):
    """Return ratio least frequent words (bottom 10%) / total words.

    Tokens are stripped of punctuation and digits (via ``f_utils``), then
    stemmed with ``PorterStemmer`` and lower-cased. The "bottom 10%" are the
    least frequent tenth of the distinct stems (at least one stem); their
    occurrences are summed and divided by the total number of words.
    Returns 0 when no words remain.
    """
    counter = Counter()
    stemmer = PorterStemmer()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        s = f_utils.remove_digits(s)
        counter.update(stemmer.stem(word).lower() for word in s)

    sum_counter = sum(counter.values())
    if sum_counter == 0:
        # Empty vocabulary: report 0 instead of dividing by zero.
        return 0

    # Number of distinct stems that make up the least common 10%.
    bottom_10 = max(int(round(len(counter) * 0.1)), 1)
    # most_common() sorts by descending count, so the tail holds the rarest
    # stems; only their counts matter, not their order.
    least_common_10 = counter.most_common()[-bottom_10:]
    sum_least_common = sum(count for _, count in least_common_10)

    return sum_least_common / sum_counter

예제 #30

0

파일 보기

파일: features.py 프로젝트: BovineJoni/StylometricClustering

def primary_verbs(sentences):
    """Yield ratio primary verb / all primary verbs

    + ratio all primary verbs / total words.

    The primary verb groups are looked up in ``_VERBS_TOBE``,
    ``_VERBS_TODO`` and ``_VERBS_TOHAVE``. For every group with at least one
    hit, yields ``(key, ratio)`` with key ``verb_tobe`` / ``verb_todo`` /
    ``verb_tohave`` and ratio being the group's hits divided by all
    primary-verb hits. Finally yields ``("primary_verbs", ratio)`` with all
    hits divided by the total number of words (0 when there are no words).
    """
    verb_groups = (
        ("verb_tobe", _VERBS_TOBE),
        ("verb_todo", _VERBS_TODO),
        ("verb_tohave", _VERBS_TOHAVE),
    )
    total_words = 0
    counter = Counter()
    for s in sentences:
        s = f_utils.remove_punctuation(s)
        total_words += len(s)
        for key, verbs in verb_groups:
            hits = sum(1 for word in s if word in verbs)
            # Only store positive counts so that groups without any hit
            # produce no feature (Counter addition dropped them before).
            if hits:
                counter[key] += hits

    sum_counter = sum(counter.values())
    # Iterate the Counter directly: ``iterkeys()`` exists only on Python 2.
    for key in counter:
        yield key, counter[key] / sum_counter
    if total_words == 0:
        # Empty input: report 0 instead of dividing by zero.
        yield "primary_verbs", 0
    else:
        yield "primary_verbs", sum_counter / total_words