def add_comment(self, comment):
    """Add *comment* to the cluster, updating the running sentiment mean
    and the term-membership index.

    The cluster sentiment is kept as a running average: the previous mean
    is scaled back up by the old comment count, the new comment's score is
    folded in, and the sum is divided by the new count.
    """
    index = len(self.comments)
    # Undo the previous averaging so the new score can be added.
    complete_sentiment = self.sentiment_score * index
    emoticon_sentiment = sentiment.get_emoticon_score(
        comment, self.emoticonlist)
    if emoticon_sentiment > 0:
        # Emoticons present: blend emoticon and text sentiment equally.
        complete_sentiment += (
            emoticon_sentiment + sentiment.get_sentiment_score(comment)) / 2.0
    else:
        complete_sentiment += sentiment.get_sentiment_score(comment)
    self.sentiment_score = float(complete_sentiment) / float(index + 1)
    self.comments.append(comment)
    # De-duplicated term vector for this comment.
    term_vector = list(set(metrics.get_term_vector(comment)))
    self.comment_term_vectors.append(term_vector)
    for term in term_vector:
        # Membership test directly on the dict -- no keys() list needed.
        if term not in self.terms:
            self.terms[term] = set()
            self.center[term] = 0
        self.terms[term].add(comment)
        self.center[term] += 1
def create_reddit_post_dictionary(submission, subreddit, sentiment_dict):
    """Flatten a reddit submission into a plain dictionary record.

    Sentiment is scored separately for the title and the self-text;
    subjects are extracted from the two concatenated together.  A deleted
    author (``submission.author is None``) is recorded as ``None``.
    """
    combined_text = submission.title + " " + submission.selftext
    author_name = submission.author.name if submission.author is not None else None
    return {
        'id': submission.id,
        'source': 'reddit',
        'sub_source': subreddit.display_name,
        'content_type': 'post',
        'author': author_name,
        'title': submission.title,
        'text': submission.selftext,
        'title_sentiment': get_sentiment_score(submission.title, sentiment_dict),
        'text_sentiment': get_sentiment_score(submission.selftext, sentiment_dict),
        'subjects': get_subjects(combined_text),
        'source_score': submission.score,
        'upvote_ratio': submission.upvote_ratio,
        'num_comments': submission.num_comments,
        'parent': None,
        'publish_date': datetime.utcfromtimestamp(submission.created_utc),
    }
def remove_comment(self, comment, index):
    """Remove the comment at *index*, reversing its contribution to the
    running sentiment mean and to the term-membership index.

    Mirrors :meth:`add_comment`: the mean is scaled back up by the old
    count, this comment's score is subtracted, and the result is divided
    by the new count.  When the last comment is removed the mean resets
    to 0.
    """
    if comment not in self.comments:
        print("Comment does not exist in cluster")
        return
    emoticon_sentiment = sentiment.get_emoticon_score(
        comment, self.emoticonlist)
    # Undo the averaging so this comment's score can be subtracted.
    complete_sentiment = self.sentiment_score * index
    if emoticon_sentiment > 0:
        complete_sentiment -= (
            emoticon_sentiment + sentiment.get_sentiment_score(comment)) / 2.0
    else:
        complete_sentiment -= sentiment.get_sentiment_score(comment)
    try:
        self.sentiment_score = float(complete_sentiment) / float(index - 1)
    except ZeroDivisionError:
        # index == 1: the cluster is now empty, so the mean resets.
        self.sentiment_score = 0
    del self.comments[index]
    tv = self.comment_term_vectors[index]
    for term in tv:
        try:
            self.terms[term].remove(comment)
            self.center[term] -= 1
        except (ValueError, KeyError):
            # Term was never indexed for this comment; best-effort cleanup.
            pass
    del self.comment_term_vectors[index]
def toTxt(filename):
    """Convert one MSD HDF5 file into a line-per-song text file.

    Every song in *filename* has its scalar MSD fields, a lyric sentiment
    score, and its timbre segments collected, then is encoded and written
    to ``filename + '.txt'``.  Songs whose sentiment score is not exactly
    0 or 1 are treated as lacking lyrics and skipped (NOTE(review): this
    matches the original check -- confirm the classifier really returns
    only 0/1 when lyrics exist).  The source HDF5 file is always deleted;
    the text file is deleted too when no valid song was written.
    """
    h5_file = open_h5_file_read(filename)
    num_songs = get_num_songs(h5_file)
    did_write = False
    with open(filename + '.txt', 'w+') as out:
        for i in range(num_songs):
            song = {}
            # Grab the scalar fields from MSD via the generated getters.
            for field in existingFieldnames:
                song[field] = globals()['get_' + field](h5_file, i)
            # Run NLP sentiment on the track's lyrics.
            song['sentiment_score'] = get_sentiment_score(
                get_track_id(h5_file, i))
            # Anything other than exactly 0 or 1 means no lyrics available.
            is_valid = song['sentiment_score'] in (0, 1)
            song['segments_timbre'] = get_segments_timbre(h5_file, i).tolist()
            if is_valid:
                print('Writing valid song')
                out.write(encode_song(song) + '\n')
                did_write = True
            else:
                print("Missing lyrics. Skipping")
    # The with-block already closed `out`; only the HDF5 handle remains.
    h5_file.close()
    # Delete the consumed HDF5 input, and the output too if it is empty.
    os.remove(filename)
    if not did_write:
        os.remove(filename + '.txt')