示例#1
0
def ave_vs(s):
    """Return the mean ``vs`` score over the verb phrases found in ``s``.

    ``s`` is a sequence of ``(word, tag)`` pairs.  The words of the whole
    sentence and of each phrase yielded by ``util.get_vps(s)`` are turned
    into strings (``str`` of the word list) before being handed to ``vs``.
    Returns 0.0 when ``util.get_vps`` yields no phrases.
    """
    sentence_key = str([word for word, _tag in s])
    score_total = 0.0
    phrase_count = 0
    for phrase in util.get_vps(s):
        phrase_key = str([word for word, _tag in phrase])
        score_total += vs(sentence_key, phrase_key)
        phrase_count += 1
    # Guard against dividing by zero when the sentence has no verb phrases.
    if phrase_count == 0:
        return 0.0
    return score_total / phrase_count
示例#2
0
def compute_features(s):
    """Build the feature dictionary for the sentence string ``s``.

    The sentence is tokenized with ``nltk.tokenize.word_tokenize``, tagged
    with ``nltk.pos_tag`` and generified with ``common.generify``.  Features
    are collected in a fixed order and returned as a dict keyed by their
    position:

        0   1.0 if any lowercased token is in ``common.SN``, else 0.0
        1   1.0 if any lowercased token is in ``common.BP``, else 0.0
        2   ``ave_ns(tagged)``
        3   1.0 if any 'V*'-tagged word is not in ``sexy_verbs``, else 0.0
        4   1.0 if any verb phrase of the generified sentence is not in
            ``se_vp_set``, else 0.0
        5   ``ave_vs(generic)``
        6   ``ave_as(tagged)``
        7   1.0 if any 'J*'-tagged word is not in ``sexy_adjs``, else 0.0
        8   number of tokens whose tag is not '.', ':' or ','
        9   number of remaining tokens (tagged '.', ':' or ',')
        10-15  counts of tags starting with D, N, P, V, J, I respectively

    Returns:
        dict mapping feature index (int) to feature value.
    """
    chunked = nltk.tokenize.word_tokenize(s)
    tagged = nltk.pos_tag(chunked)
    generic = list(common.generify(tagged))

    feats = []

    # "Noun Euphemism" features

    # Has sexy noun
    feats.append(1.0 if any(w.lower() in common.SN for w in chunked) else 0.0)

    # Has body part
    feats.append(1.0 if any(w.lower() in common.BP for w in chunked) else 0.0)

    #TODO: NS(s) = 10^-7

    # Average NS(s) for all nouns in s not in union of SN and BP
    feats.append(ave_ns(tagged))

    # "Structural Element" features

    # Verb that never occurs in Se (approximated by membership in sexy_verbs)
    has_unseen_verb = any(
        word not in sexy_verbs
        for word, tag in tagged
        if tag.startswith('V')
    )
    feats.append(1.0 if has_unseen_verb else 0.0)

    # Verb phrase that never occurs in Se; phrases are keyed by the string
    # form of their word list, matching how se_vp_set is looked up.
    has_unseen_vp = any(
        str([word for word, _tag in vp]) not in se_vp_set
        for vp in util.get_vps(generic)
    )
    feats.append(1.0 if has_unseen_vp else 0.0)

    # Average VS over all verb phrases in s
    feats.append(ave_vs(generic))

    # Average AS for all adjs in s
    feats.append(ave_as(tagged))

    # Is there an adjective in s that never occurs in Se U Sb with a noun
    # in SN (checked here as: adjective not in sexy_adjs)
    has_unseen_adj = any(
        word not in sexy_adjs
        for word, tag in tagged
        if tag.startswith('J')
    )
    feats.append(1.0 if has_unseen_adj else 0.0)

    # Basic Structure

    # Non-punctuation tokens: every token whose tag is not '.', ':' or ','.
    # (The feature order is kept as-is so positional keys stay stable.)
    non_punct_count = sum(
        1 for _word, tag in tagged if tag not in ('.', ':', ',')
    )
    feats.append(non_punct_count)

    # Punctuation tokens: whatever is left of the token count.
    feats.append(len(chunked) - non_punct_count)

    #TODO: Number of times each pronoun and each (~~approximately) POS appears in s
    # Coarse POS counts by first tag letter, in this order:
    # determiners, nouns, pronouns, verbs, adjectives, prepositions.
    for prefix in ('D', 'N', 'P', 'V', 'J', 'I'):
        feats.append(sum(1 for _word, tag in tagged if tag.startswith(prefix)))

    # NOTE: a "subject" feature (first word tagged 'N*' or 'PRP', or "" if
    # there is none) was prototyped here but is currently disabled.

    return dict(enumerate(feats))
示例#3
0
#print "calculating fdist 2"
#s_in_se = Placeholder(se_words)
#print "calculating fdist 3"
#v_in_s = Placeholder(se_vp_words + sb_vp_words)
#
#def vs(s, v):
#    return math.pow(2, v_in_se.phrase_prob(v) + s_in_se.phrase_prob(s) - v_in_s.phrase_prob(v))

def ave_vs(s):
    """Average the ``vs`` score across every verb phrase in ``s``.

    ``s`` is a sequence of ``(word, tag)`` pairs.  Both the full sentence
    and each phrase from ``util.get_vps(s)`` are keyed by the string form
    of their word list, which is what ``vs`` receives.  If no verb phrase
    is produced the result is 0.0.
    """
    sentence_key = str([word for word, _tag in s])
    running_score = 0.0
    n_phrases = 0
    for phrase in util.get_vps(s):
        running_score += vs(sentence_key, str([word for word, _tag in phrase]))
        n_phrases += 1
    # No verb phrases: avoid a division by zero and report a neutral score.
    if n_phrases == 0:
        return 0.0
    return running_score / n_phrases

# Verb phrases and sentences are keyed by the string form of their word list
# so they can be counted and looked up as plain hashable values.

# Verb-phrase keys from the erotic corpus.
se_vps = [
    str([word for word, _tag in phrase])
    for sent in erotic.generic_tagged_sents()
    for phrase in util.get_vps(sent)
]
# Verb-phrase keys from the Brown corpus.
sb_vps = [
    str([word for word, _tag in phrase])
    for sent in brown.generic_tagged_sents()
    for phrase in util.get_vps(sent)
]
# Whole-sentence keys from the erotic corpus.
se_sents = [
    str([word for word, _tag in sent])
    for sent in erotic.generic_tagged_sents()
]
se_vp_set = set(se_vps)

# Laplace-smoothed distributions; the bin count of each one is the number of
# distinct keys it was built from.
v_in_se = nltk.LaplaceProbDist(nltk.FreqDist(se_vps), len(se_vp_set))
s_in_se = nltk.LaplaceProbDist(nltk.FreqDist(se_sents), len(set(se_sents)))
v_in_s = nltk.LaplaceProbDist(
    nltk.FreqDist(se_vps + sb_vps),
    len(se_vp_set.union(sb_vps)),
)

def vs(s, v):
    """Score verb-phrase key ``v`` together with sentence key ``s``.

    Computes ``v_in_se.prob(v) * s_in_se.prob(s) / v_in_s.prob(v)`` using
    the module-level probability distributions.
    """
    phrase_prob_se = v_in_se.prob(v)
    sentence_prob_se = s_in_se.prob(s)
    phrase_prob_all = v_in_s.prob(v)
    return (phrase_prob_se * sentence_prob_se) / phrase_prob_all