from collections import defaultdict
from time import strptime, mktime
import string

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Project-level helpers used below (words, known_words, WORD_MIN, pattern,
# SentenceTokenizer, load_classifier, sentiment_path, pos_freq, capital_frequency,
# sent_frequency, sentence_capital_frequency, entropy, lexical_diversity,
# missing_words, swears, F_K_score, make_full_dict, get_subjectivity,
# get_polarity_overlap, timeliness, lengthiness, termf, tf_idf, onSubForumTopic)
# are assumed to be defined elsewhere in this package.


def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])
    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)

    # First pass: build one bag of stemmed, stop-word-free tokens over all comments
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stops
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")

    # Keep the 100 most frequent bigrams as the feature vocabulary
    tempVector = dict()
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    theKeys = tempVector.keys()

    # Second pass: count vocabulary bigrams per comment
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stops
            filtered_words = [w for w in mywords if w not in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]

            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1

            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
            if index >= commentCount:
                break
        if index >= commentCount:
            break

    print "non-zero", np.count_nonzero(featureMatrix)
    print "Percentage filled: %.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1]))
    return featureMatrix
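
# Usage sketch for extract_bigrams (illustrative only). The function reads just a
# .body string from each comment object and relies on this module's own
# words()/known_words() helpers; _DemoComment below is a hypothetical stand-in,
# not part of the real pipeline.
class _DemoComment(object):
    def __init__(self, body):
        self.body = body


def _demo_extract_bigrams():
    articleList = {
        'article-1': [_DemoComment("the quick brown fox jumps over the lazy dog"),
                      _DemoComment("the lazy dog sleeps all day")],
    }
    commentCount = sum(len(comms) for comms in articleList.values())
    features = extract_bigrams(articleList, commentCount)
    print features.shape  # expected: (2, 100)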
def read_news24_comments(filename, skip=True, skip_mtn=False, limit=-1):
    # Read the '&'-delimited News24 dump; skip=True drops short comments
    # (< WORD_MIN words), skip_mtn drops comments matching a small keyword
    # blacklist. `limit` is currently unused.
    values = defaultdict(list)
    headers_news24 = ['article_id', 'comment_id', 'thread_root_id', 'user_id',
                      'likes', 'dislikes', 'reported', 'status', 'rating',
                      'date', 'author', 'article_title', 'article_body',
                      'comment_content', 'lemma_body', 'pos_body']
    f1 = open(filename, 'r')

    commentCount = 0
    totalCount = 0
    lessThanCount = 0
    mtnCount = 0
    for line in f1:
        temp = line.split('&')
        body = temp[13].lower()
        totalCount += 1

        if skip:
            if len(words(body)) < WORD_MIN:
                lessThanCount += 1
                continue
        if skip_mtn:
            if "mtn" in body or "honda" in body or "toyota" in body or "form" in body or "camry" in body or "service" in body:
                mtnCount += 1
                continue

        commentCount += 1
        if commentCount % 10000 == 0:
            print "Read", commentCount, "comments"
        for i, v in enumerate(temp):
            values[headers_news24[i]].append(v)

    df_news24_large = pd.DataFrame(values)

    # Replace the date string with a time struct
    def map_date(date):
        date_ret = strptime(date, "%Y-%m-%d %H:%M:%S.%f")
        return date_ret
    df_news24_large.date = df_news24_large.date.map(map_date)

    # Comments with a 'null' root are roots of their own thread
    cond = df_news24_large.thread_root_id == 'null'
    df_news24_large.loc[cond, 'thread_root_id'] = df_news24_large['comment_id']
    print df_news24_large[cond].shape

    print "Saved", commentCount, "comments out of", totalCount
    print lessThanCount, "comments less than", WORD_MIN
    print mtnCount, "mtn comments"
    return df_news24_large
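
# Usage sketch for read_news24_comments (hypothetical file path). The loader
# expects one '&'-delimited record per line, fields in the headers_news24 order,
# with the comment body at index 13 and dates formatted "%Y-%m-%d %H:%M:%S.%f".
def _demo_read_news24():
    df_news24 = read_news24_comments('data/news24_comments.txt', skip=True, skip_mtn=True)
    print df_news24.shape
    print len(df_news24['thread_root_id'].unique()), "threads"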
def extract_sentence_values(articleList, commentList, parentList, commentCount):
    # Per-sentence target values: total votes, like ratio, reported flag, status flag
    valueVector = np.empty([commentCount, 4])
    index = 0
    for commList in commentList.values():
        sumVotes = 0  # total votes in the thread (currently unused)
        for comm in commList:
            sumVotes += comm.likeCount + comm.dislikeCount

        for comm in commList:
            sentences = nltk.sent_tokenize(comm.lemma_body)
            for sent in sentences:
                tokens = nltk.regexp_tokenize(sent, pattern)
                theWords = words(comm.body)
                uniqueWords = set(theWords)
                if len(tokens) == 0 or len(uniqueWords) == 0:
                    continue

                ratio = comm.likeCount / float(max(1, comm.likeCount + comm.dislikeCount))
                totalVotes = comm.likeCount + comm.dislikeCount

                valueVector[index, 0] = totalVotes
                valueVector[index, 1] = ratio
                if comm.reported > 0:
                    valueVector[index, 2] = 1
                else:
                    valueVector[index, 2] = 0
                if comm.status == 1:
                    valueVector[index, 3] = 0
                else:
                    valueVector[index, 3] = 1

                index += 1
                if index % 1000 == 0:
                    print "extracted", index, "values"
                if index >= commentCount:
                    break
            if index >= commentCount:
                break
        if index >= commentCount:
            break
    return valueVector
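
# Usage sketch for extract_sentence_values (hypothetical stand-in object; the
# real pipeline passes parsed News24 comment objects). Only the attributes read
# above are provided; articleList and parentList are unused by the function, and
# the module-level tokenization regexp `pattern` is assumed to be defined.
class _DemoVotedComment(object):
    def __init__(self, body, likes, dislikes, reported=0, status=1):
        self.body = body
        self.lemma_body = body
        self.likeCount = likes
        self.dislikeCount = dislikes
        self.reported = reported
        self.status = status


def _demo_extract_sentence_values():
    commentList = {'thread-1': [_DemoVotedComment("A great article overall.", 10, 2)]}
    values = extract_sentence_values({}, commentList, {}, 1)
    # Columns: total votes, like ratio, reported flag, status flag
    print values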
def read_slashdot_comments(filename, skip=True):
    # Read the tab-delimited Slashdot dump into a DataFrame
    values = defaultdict(list)
    headers = ['article_id', 'comment_id', 'thread_root_id', 'parent_id', 'author',
               'score', 'flag', 'date', 'wtf', 'article_title', 'article_body',
               'comment_title', 'has_link', 'comment_content', 'quoted_text']
    skippedCount = 0
    commentCount = 0
    f1 = open(filename, 'r')
    for line in f1:
        temp = line.split('\t')
        if len(temp) < 14:
            continue
        if len(words(temp[13])) == 0:
            continue
        for i, v in enumerate(temp):
            values[headers[i]].append(v)

    # Create DataFrame and drop the placeholder 'wtf' column
    df_slashdot = pd.DataFrame(values)
    df_slashdot.drop('wtf', axis=1, inplace=True)

    # Re-encode strings as UTF-8
    for col in df_slashdot.columns:
        df_slashdot[col] = df_slashdot[col].str.decode('iso-8859-1').str.encode('utf-8')

    # Comments with a 'NULL' root are roots of their own thread
    cond = df_slashdot.thread_root_id == 'NULL'
    df_slashdot.loc[cond, 'thread_root_id'] = df_slashdot['comment_id']
    print df_slashdot[df_slashdot.thread_root_id == 'NULL'].shape

    # Replace the date string with a time struct (two observed formats)
    def map_date(date):
        try:
            date_ret = strptime(date, "<> on %A %B %d, %Y @%H:%M%p ()")
        except ValueError:
            date_ret = strptime(date, "on %A %B %d, %Y @%H:%M%p ()")
        return date_ret
    df_slashdot.date = df_slashdot.date.map(map_date)

    if skip:
        # Drop 'Anonymous Coward' posts and comments with a score of 2
        df_slashdot = df_slashdot[df_slashdot['author'].str.lower() != 'anonymous coward']
        df_slashdot = df_slashdot[df_slashdot['score'] != '2']

    print "Done with comments"
    return df_slashdot
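
# Usage sketch for read_slashdot_comments (hypothetical file path). The loader
# expects tab-delimited records in the `headers` order above; skip=True drops
# anonymous posts and comments whose score string is '2'.
def _demo_read_slashdot():
    df_slashdot = read_slashdot_comments('data/slashdot_comments.tsv', skip=True)
    print df_slashdot.shape
    print len(df_slashdot['thread_root_id'].unique()), "threads"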
def extract_feature_matrix(df_comments, df_thread_groupby):
    print "START"
    # Sentence tokenizer and pre-trained sentiment classifier
    sentencer = SentenceTokenizer()
    clf = load_classifier(sentiment_path + 'sentiment_classifier.pickle')

    featureMatrix = np.empty([df_comments.shape[0], 25])

    # Map comment_id -> row index in the feature matrix
    feature_dict = dict()
    for ix, row in df_comments.iterrows():
        feature_dict[row['comment_id']] = ix

    # Pass 1: per-comment features
    feature_count = 0
    for _, row in df_comments.iterrows():
        index = feature_dict[row['comment_id']]
        comm = row['comment_content'].decode('ASCII', 'ignore')
        tokens = words(comm)
        unique_tokens = set(tokens)
        sentences = sentencer.tokenize(comm)

        featureMatrix[index][3] = len(comm)

        verb_fr, noun_fr, pronoun_fr = pos_freq(tokens)
        featureMatrix[index][4] = verb_fr
        featureMatrix[index][5] = noun_fr
        featureMatrix[index][6] = pronoun_fr

        featureMatrix[index][7] = capital_frequency(tokens)
        featureMatrix[index][8] = sent_frequency(sentences, '?')
        featureMatrix[index][9] = sent_frequency(sentences, '!')
        featureMatrix[index][10] = sentence_capital_frequency(sentences)

        featureMatrix[index][11] = entropy(comm)
        featureMatrix[index][12] = lexical_diversity(tokens)

        if len(tokens) == 0:
            featureMatrix[index][13] = 0
            featureMatrix[index][14] = 0
            featureMatrix[index][15] = 0
            featureMatrix[index][16] = 0
        else:
            spelt_wrong = missing_words(unique_tokens)
            bad_words_list = swears(unique_tokens)
            featureMatrix[index][13] = len(spelt_wrong)
            featureMatrix[index][14] = len(spelt_wrong) / float(len(unique_tokens))
            featureMatrix[index][15] = len(bad_words_list)
            featureMatrix[index][16] = len(bad_words_list) / float(len(unique_tokens))

        featureMatrix[index][19] = F_K_score(sentences, tokens)

        # Sentiment, subjectivity and comment/article polarity overlap
        testSet = dict()
        refWords = make_full_dict(tokens)
        testSet.update(refWords)
        probDist = clf.prob_classify(testSet)
        sentiment = probDist.prob('pos')
        subj_obj = get_subjectivity(probDist)
        polarity_overlap = get_polarity_overlap(words(row['article_body']), tokens, clf)
        featureMatrix[index][22] = sentiment
        featureMatrix[index][23] = subj_obj
        featureMatrix[index][24] = polarity_overlap

        feature_count += 1
        if feature_count % 1000 == 0:
            print feature_count

    print "DONE"

    # Pass 2: thread-level features, grouped by thread root
    feature_count = 0
    for _, group in df_thread_groupby:
        thread_comments = [row['comment_content'] for _, row in group.iterrows()]

        # Running average time between comments
        sumTime = 0
        count = 0
        previous = mktime(group.iloc[0]['date'])
        first = mktime(group.iloc[0]['date'])

        # Running average comment length
        sumLen = 0

        thread_tokens = []

        # Within thread
        for _, row in group.iterrows():
            index = feature_dict[row['comment_id']]
            comm = row['comment_content'].decode('ascii', 'ignore')
            tokens = words(comm)
            sentences = sentencer.tokenize(comm)

            # Ongoing average time
            sumTime += mktime(row['date']) - previous
            count += 1
            avgTime = sumTime / float(count)

            # Ongoing average length
            sumLen += len(words(row['comment_content']))
            avgLen = sumLen / float(count)

            ######################################################################
            # Chunk the comment's sentences and collect named-entity/word tokens
            for sent in sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                doc = [word.strip(string.punctuation) for word in doc
                       if len(word.strip(string.punctuation)) > 1]
                # The cumulative thread tokens up to this point
                thread_tokens += doc

            ######################################################################
            # Chunk the article body the same way
            article_tokens = []
            article_sentences = sentencer.tokenize(row['article_body'])
            for sent in article_sentences:
                sent_tokens = words(sent)
                sent_tokens_tagged = nltk.pos_tag(sent_tokens)
                chunks = nltk.ne_chunk(sent_tokens_tagged, binary=True)
                doc = []
                for chunk in chunks:
                    if type(chunk) == nltk.Tree:
                        doc.append(' '.join(c[0] for c in chunk.leaves()))
                    else:
                        doc.append(chunk[0])
                article_tokens += [word.strip(string.punctuation) for word in doc
                                   if len(word.strip(string.punctuation)) > 1]

            ######################################################################
            featureMatrix[index][0] = timeliness(mktime(row['date']), previous, max(avgTime, 1))
            previous = mktime(row['date'])

            featureMatrix[index][1] = mktime(row['date']) - first

            featureMatrix[index][2] = lengthiness(words(row['comment_content']), max(avgLen, 1))

            featureMatrix[index][17] = np.mean([termf(comm.count(w), tokens) for w in set(tokens)])

            featureMatrix[index][18] = tf_idf(comm, thread_comments)

            featureMatrix[index][20] = onSubForumTopic(tokens, thread_tokens)
            featureMatrix[index][21] = onSubForumTopic(tokens, article_tokens)

            feature_count += 1
            if feature_count % 1000 == 0:
                print feature_count

    return featureMatrix
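
# End-to-end sketch tying a loader and the feature extractor together
# (hypothetical file path). extract_feature_matrix expects the comments
# DataFrame plus the same frame grouped by thread root, with the `date`
# column already converted to time structs by the loader; comments are
# assumed to appear in chronological order within each thread.
def _demo_feature_pipeline():
    df_comments = read_news24_comments('data/news24_comments.txt', skip=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')
    features = extract_feature_matrix(df_comments, df_thread_groupby)
    print features.shape  # (number of comments, 25)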