def processRow(row): result = dict(zip( ["id", "set", "essay", "rate1", "rate2", "grade",], row)) sys.stdout.write("\r %s#%s" % (row[1], row[0])) sys.stdout.flush() text_asis = row[2].decode('mac-roman') text = row[2].strip().decode('mac-roman').lower() result["num_chars"] = len(text) sents = nltk.sent_tokenize(text) num_sents = len(sents) result["num_sents"] = num_sents words_in_sentances = [nltk.word_tokenize(sentance) for sentance in sents] words = [] for sent in words_in_sentances: for word in sent: if word not in PUNCTUATION and not all(char in PUNCTUATION for char in word): words.append(word) num_words = len(words) result["num_words"] = num_words result["sentance_length"] = num_words / float(num_sents) num_correctly_spelled = 0 for word in words: try: if enchantDict.check(word) or NER_re.match(word) or word in CONTRACTIONS or word in SPECIAL_WORDS: num_correctly_spelled += 1 # else: # print word.encode('utf-8') except enchant.errors.Error: print "can't spell check", word result["num_correctly_spelled"] = num_correctly_spelled num_syl = 0 for word in words: num_syl += syl.SyllableCount(word) result["num_syl"] = num_syl fk_grade_level = (0.39 * (num_words / num_sents)) \ + (11.8 * (num_syl / num_words)) - 15.59 result["fk_grade_level"] = fk_grade_level if words[0] == 'dear': result["starts_with_dear"] = 1 else: result["starts_with_dear"] = 0 result["distinct_words"] = len(set(words)) #Part of Speech tagging tagged_sentences = [nltk.pos_tag(sent) for sent in words_in_sentances] for pos in pos_dict.keys() result["pos_%s" % pos] = 0 for word, pos in itertools.chain(*tagged_sentences): if pos in pos_dict.keys(): result["pos_%s" % pos] += 1 pos_cnt_all[pos] += 1 #flag ending in a preposition result["end_with_preposition"] = 0 for sent in tagged_sentences: try: if sent[-2][1] == "IN": result["end_with_preposition"] += 1 except: pass #these lines are too clever #try to sum up the counts in the result table for each of these parts of speech to get combos 
result["num_nouns"] = sum(result.get("pos_%s" % key, 0) for key in ("NN", "NNP", "NNS")) result["num_verbs"] = sum(result.get("pos_%s" % key, 0) for key in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")) result["num_adjectives"] = sum(result.get("pos_%s" % key, 0) for key in ("JJ", "JJR", "JJS")) result["num_adverbs"] = sum(result.get("pos_%s" % key, 0) for key in ("RB", "RBR", "RBS")) result["num_superlatives"] = sum(result.get("pos_%s" % key, 0) for key in ("JJS", "RBS")) n_proper_quotes = len(proper_quote_re.findall(text_asis)) n_bad_quotes = len(bad_quote_re.findall(text_asis)) if n_proper_quotes > n_bad_quotes: result["proper_quote_punc"] = 1 elif n_proper_quotes < n_bad_quotes: result["proper_quote_punc"] = -1 else: result["proper_quote_punc"] = 0 result["has_comma"] = 1 if "," in text else 0 result["has_semicolon"] = 1 if ";" in text else 0 result["has_questionmark"] = 1 if "?" in text else 0 result["has_exclamation"] = 1 if "!" in text else 0 result["num_quotes"] = len([char for char in text_asis if char in u'"\u201c\u201d']) #frequencies of NER for ner in NERs: matches = re.findall(r"@%s\d+\b" % ner.upper(), text_asis) result["ner_%s" % ner] = len(matches) # print text # print sents # print words_in_sentances # print words # print tagged_sentences return result
# Whitelist tokens the spell checker should accept: bare site names plus
# their ".com" and "www.<name>.com" variants.
SPECIAL_WORDS.update(websites)
SPECIAL_WORDS.update(x + ".com" for x in websites)
SPECIAL_WORDS.update("www." + x + ".com" for x in websites)

# Matches lower-cased named-entity placeholders such as "person1" or
# "date12" (the corpus replaces entities with @PERSON1 etc.; after
# lower-casing and punctuation stripping they look like "person1").
NER_re = re.compile(r"""(?:organization|caps|date|percent|person|money|location|num|month|time)\d+$""")
NERs = ["person", "organization", "location", "date", "time", "money",
        "percent", "caps", "num", "month"]

# Output column order for the feature table produced by processRow().
keys = ["id", "set", "essay", "rate1", "rate2", "grade", "num_chars",
        "num_sents", "num_words", "num_syl", "sentance_length",
        "num_correctly_spelled", "fk_grade_level", "starts_with_dear",
        "distinct_words", "end_with_preposition", "num_nouns",
        "num_verbs", "num_adjectives", "num_adverbs", "num_superlatives",
        "has_comma", "has_semicolon", "has_questionmark",
        "has_exclamation", "num_quotes", "proper_quote_punc"]
keys.extend("ner_%s" % x for x in NERs)
keys.extend("pos_%s" % x for x in sorted(pos_dict.keys()))

def processRow(row):
    # NOTE(review): this chunk ends mid-function — the rest of the
    # definition lies outside the visible range, so only comments were
    # added here.
    result = dict(zip(
        ["id", "set", "essay", "rate1", "rate2", "grade",], row))
    # Progress indicator: rewrite the same console line per essay.
    sys.stdout.write("\r %s#%s" % (row[1], row[0]))
    sys.stdout.flush()
    # Original-case text for punctuation/NER features; lower-cased text
    # for tokenization and counts.
    text_asis = row[2].decode('mac-roman')
    text = row[2].strip().decode('mac-roman').lower()
    result["num_chars"] = len(text)
    sents = nltk.sent_tokenize(text)
    num_sents = len(sents)