def getAffectRatio(self, filename, text):
    """Record the ratio of affect-word matches to words in ``text``.

    Every entry of ``liwc()['Affect']`` is used as a regular expression
    (with a trailing ``\\b`` appended) and its matches in ``text`` are
    counted.  The total is divided by the number of tokens returned by
    ``getWords(text)`` and stored under ``poemFeatures["affectRatio"]``.

    Args:
        filename: Not used by this function.
        text: The text to analyse.

    Returns:
        None.  The result is written to ``poemFeatures["affectRatio"]``;
        ``self.wordsAffect`` is (re)assigned as a side effect.
    """
    # this could be provided as a feature, too
    self.wordsAffect = liwc()['Affect']

    # NOTE: only a trailing word boundary is appended, so an entry can
    # also match at the end of a longer word.
    count = sum(
        len(re.findall(regex + r"\b", text)) for regex in self.wordsAffect
    )

    words = getWords(text)
    # Texts with no remaining words would otherwise raise
    # ZeroDivisionError; report a ratio of 0.0 for them instead.
    ratio = count * 1.0 / len(words) if words else 0.0

    # NOTE(review): ``poemFeatures`` is not defined in the visible part of
    # the file -- confirm it is a module-level mapping (or should be an
    # attribute of ``self``).
    poemFeatures["affectRatio"] = ratio
from nltk import corpus
from math import log
from random import sample

MIN_COMMENT_NUM = 10
# Directory scanned by getCommentFilenames(); paths are built by plain
# string concatenation, so the trailing slash is required.
COMMENT_DIR = "../data/comments_old/"
AFFECT_RATIO_DICT = "affect_ratio.p"
AFFECT_RATIO_PER_COMMENT_DICT = "affect_ratio2.p"
NRC_RATIO_DICT = "nrc_ratio.p"
NRC_FILE = '../data/NRC-lexicon.txt'
IGNORE_FILES = ["039",  # someone added wikipedia articles as comments
                "411", "447", "466"  # lots of loves
                ]

# English stopwords from the NLTK stopwords corpus.
stopwordList = corpus.stopwords.words('english')
# NOTE(review): liwc() is defined elsewhere; presumably it returns a mapping
# from LIWC category name to a list of word patterns -- confirm.
affectWordList = liwc()['Affect']


def makeRegexFromList(l):
    """Build one regex that matches any entry of ``l`` as a whole word.

    The entries are joined into an alternation in which every entry is
    wrapped in ``\\b`` word boundaries.  Entries are not escaped, so they
    are interpreted as regex fragments.  Every literal ``.`` in the joined
    pattern is then replaced by the character class ``[a-z]``
    (presumably so that wildcard entries such as ``foo.*`` only extend
    over lowercase letters -- confirm against the word lists).

    Args:
        l: Iterable of pattern strings.

    Returns:
        The combined pattern as a string.
    """
    result = r"\b" + r"\b|\b".join(l) + r"\b"
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    return re.sub(r"\.", "[a-z]", result)


def removeStopwords(text):
    """Return ``text`` with every match of a ``stopwordList`` entry deleted.

    Matching is case-sensitive and only the matched words are removed;
    surrounding whitespace is left in place.
    """
    stopwordRegex = makeRegexFromList(stopwordList)
    return re.sub(stopwordRegex, "", text)


def getWords(text):
    """Split ``text`` into word tokens after removing stopwords.

    Args:
        text: A string, or an iterable of strings which is concatenated
            without a separator.

    Returns:
        List of maximal runs of word characters and apostrophes found in
        the stripped, stopword-free text.
    """
    text = ''.join(text).strip()
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    return re.findall(r"[\w']+", removeStopwords(text))


def getCommentFilenames():
    """List the files located directly in ``COMMENT_DIR``.

    Returns:
        List of ``(name, path)`` tuples, one per directory entry for which
        ``isfile(path)`` is true, where ``path`` is ``COMMENT_DIR + name``.
    """
    result = []
    for name in listdir(COMMENT_DIR):
        path = COMMENT_DIR + name  # build the path once, reuse it below
        if isfile(path):
            result.append((name, path))
    return result