Exemplo n.º 1
0
def processRow(row):
    """Build the feature dictionary for a single essay row.

    The first six items of ``row`` are stored under the keys ``id``,
    ``set``, ``essay``, ``rate1``, ``rate2`` and ``grade`` (in that order).
    ``row[2]`` is the essay text as a mac-roman encoded byte string; the
    remaining entries of the result are features computed from it
    (character/sentence/word/syllable counts, spelling, Flesch-Kincaid
    grade level, part-of-speech counts, punctuation flags and counts of
    ``@NER<digits>`` placeholders).

    Side effects: writes a one-line progress indicator to stdout and
    increments the module-level ``pos_cnt_all`` counter for every tagged
    token.

    An essay that yields no sentences or no words gets 0.0 for the
    corresponding ratio instead of raising ``ZeroDivisionError``.

    Returns:
        dict mapping feature name to value.
    """
    result = dict(zip(
        ["id", "set", "essay", "rate1", "rate2", "grade"],
        row))

    # "\r" rewrites the same console line so progress does not scroll.
    sys.stdout.write("\r %s#%s" % (row[1], row[0]))
    sys.stdout.flush()

    # text_asis keeps the original case and surrounding whitespace (needed
    # for quote and @NER matching); text is the normalised form used for
    # tokenisation.
    text_asis = row[2].decode('mac-roman')
    text = row[2].strip().decode('mac-roman').lower()

    result["num_chars"] = len(text)

    sents = nltk.sent_tokenize(text)
    num_sents = len(sents)
    result["num_sents"] = num_sents

    # Tokens per sentence, punctuation included (the POS tagger below
    # needs the punctuation tokens).
    words_in_sentances = [nltk.word_tokenize(sentance) for sentance in sents]

    # Flat list of real words: drop tokens that are punctuation or are
    # made up entirely of punctuation characters.
    words = []
    for token in itertools.chain.from_iterable(words_in_sentances):
        if token in PUNCTUATION:
            continue
        if all(char in PUNCTUATION for char in token):
            continue
        words.append(token)
    num_words = len(words)
    result["num_words"] = num_words

    # Guard against empty essays; float() forces true division on Python 2.
    words_per_sentence = num_words / float(num_sents) if num_sents else 0.0
    result["sentance_length"] = words_per_sentence

    num_correctly_spelled = 0
    for word in words:
        try:
            if (enchantDict.check(word) or NER_re.match(word)
                    or word in CONTRACTIONS or word in SPECIAL_WORDS):
                num_correctly_spelled += 1
        except enchant.errors.Error:
            # Best effort: report the word and count it as not correct.
            print("can't spell check %s" % word)
    result["num_correctly_spelled"] = num_correctly_spelled

    num_syl = sum(syl.SyllableCount(word) for word in words)
    result["num_syl"] = num_syl

    # Flesch-Kincaid grade level. Both ratios must use true division:
    # the original int/int form truncated them under Python 2.
    syllables_per_word = num_syl / float(num_words) if num_words else 0.0
    result["fk_grade_level"] = (
        (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59)

    # text was lower-cased above, so a plain comparison is enough.
    result["starts_with_dear"] = 1 if words and words[0] == 'dear' else 0

    result["distinct_words"] = len(set(words))

    # Part of Speech tagging
    tagged_sentences = [nltk.pos_tag(sent) for sent in words_in_sentances]

    # Start every tracked tag at zero so all rows share the same columns.
    for pos in pos_dict:
        result["pos_%s" % pos] = 0
    for _word, pos in itertools.chain.from_iterable(tagged_sentences):
        if pos in pos_dict:
            result["pos_%s" % pos] += 1
        # Global tally includes tags that are not tracked per row.
        pos_cnt_all[pos] += 1

    # Flag sentences ending in a preposition: the second-to-last token is
    # inspected because the last one is expected to be the closing
    # punctuation. Sentences with fewer than two tokens are skipped.
    result["end_with_preposition"] = 0
    for sent in tagged_sentences:
        if len(sent) >= 2 and sent[-2][1] == "IN":
            result["end_with_preposition"] += 1

    # Combined counts: sum the per-tag counters of each group; tags that
    # are not tracked in pos_dict contribute 0.
    tag_groups = (
        ("num_nouns", ("NN", "NNP", "NNS")),
        ("num_verbs", ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")),
        ("num_adjectives", ("JJ", "JJR", "JJS")),
        ("num_adverbs", ("RB", "RBR", "RBS")),
        ("num_superlatives", ("JJS", "RBS")),
    )
    for feature, tags in tag_groups:
        result[feature] = sum(result.get("pos_%s" % tag, 0) for tag in tags)

    # 1 / -1 / 0 depending on whether proper-quote matches outnumber,
    # are outnumbered by, or equal the bad-quote matches.
    n_proper_quotes = len(proper_quote_re.findall(text_asis))
    n_bad_quotes = len(bad_quote_re.findall(text_asis))
    if n_proper_quotes > n_bad_quotes:
        result["proper_quote_punc"] = 1
    elif n_proper_quotes < n_bad_quotes:
        result["proper_quote_punc"] = -1
    else:
        result["proper_quote_punc"] = 0

    result["has_comma"] = 1 if "," in text else 0
    result["has_semicolon"] = 1 if ";" in text else 0
    result["has_questionmark"] = 1 if "?" in text else 0
    result["has_exclamation"] = 1 if "!" in text else 0
    # Straight double quotes plus the left/right curly double quotes.
    result["num_quotes"] = sum(
        1 for char in text_asis if char in u'"\u201c\u201d')

    # Frequencies of NER placeholders such as "@PERSON1" in the raw text.
    for ner in NERs:
        matches = re.findall(r"@%s\d+\b" % ner.upper(), text_asis)
        result["ner_%s" % ner] = len(matches)

    return result
Exemplo n.º 2
0
# Whitelist website names for the spell checker in three spellings:
# the bare name, "<name>.com" and "www.<name>.com".
SPECIAL_WORDS.update(websites)
SPECIAL_WORDS.update(site + ".com" for site in websites)
SPECIAL_WORDS.update("www." + site + ".com" for site in websites)

# Matches a whole token that is an NER placeholder: one of the category
# names followed by digits up to the end of the token.
NER_re = re.compile(
    r"""(?:organization|caps|date|percent|person|money|location|num|month|time)\d+$"""
)
# NER categories; the order here fixes the order of the "ner_*" columns.
NERs = [
    "person", "organization", "location", "date", "time",
    "money", "percent", "caps", "num", "month",
]


# Ordered list of result keys: the raw row fields, the computed features,
# one "ner_*" entry per NER category, then one "pos_*" entry per tracked
# part-of-speech tag in sorted order.
keys = (
    [
        "id", "set", "essay", "rate1", "rate2", "grade",
        "num_chars", "num_sents", "num_words", "num_syl",
        "sentance_length", "num_correctly_spelled", "fk_grade_level",
        "starts_with_dear", "distinct_words", "end_with_preposition",
        "num_nouns", "num_verbs", "num_adjectives", "num_adverbs",
        "num_superlatives",
        "has_comma", "has_semicolon", "has_questionmark",
        "has_exclamation", "num_quotes", "proper_quote_punc",
    ]
    + ["ner_%s" % name for name in NERs]
    + ["pos_%s" % tag for tag in sorted(pos_dict)]
)

def processRow(row):
    result = dict(zip(
        ["id", "set", "essay", "rate1", "rate2", "grade",],
        row))
    sys.stdout.write("\r %s#%s" % (row[1], row[0]))
    sys.stdout.flush()

    text_asis = row[2].decode('mac-roman')
    text = row[2].strip().decode('mac-roman').lower()

    result["num_chars"] = len(text)

    sents = nltk.sent_tokenize(text)
    num_sents = len(sents)