Example #1
def emotionWith(name, soup, emotion):
	# Find the one-on-one thread whose first participant matches `name`.
	threads = soup.find_all(attrs={"class":"thread"})
	convo = None
	for t in threads:
		messages = t.children
		members = next(messages)          # the first child lists the participants
		members_list = members.split(',')
		#print(members)
		if len(members_list) == 2 and name in members_list[0]:
			break

	# Scaffolding for an HTML log; this example only prints to stdout so far.
	f = open('helloworld.html','w')
	html_text = "<html><head></head><body><h1>Message Log</h1>"
	
	# messages = [m for m in messages]	
	# # Eventually I'll want to create a list so that I can 
	# # give context for these messages		

	for header in messages:
		user = header.find(attrs={"class":"user"})
		date_text = header.find(attrs={"class":"meta"}).text
		date_text = date_text + '00'  # pad the timezone offset so strptime could parse it
		#date = datetime.strptime(date_text, "%A, %B %d, %Y at %I:%M%p %Z%z")
		message = next(messages)      # the element after each header holds the message body
		
		
		try:
			# Print the message when its score for the requested emotion is high enough.
			if vs(message.string)[emotion] > 0.2:
				print("----------")
				print(user.string)
				print(date_text)
				print(message.string)
				print(vs(message.string))
				print('')
		except Exception:
			# Some entries have no text (stickers, photos), so scoring fails.
			pass
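
In Examples #1-#3, `vs` is called directly on text and indexed like a dict, which matches a pre-bound VADER `polarity_scores`; `soup` is a parsed HTML message export. Neither is defined in the snippets, so this setup is a sketch (the alias, the file name, and the export format are assumptions):

# Assumed setup -- not part of the original snippets.
from bs4 import BeautifulSoup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vs = SentimentIntensityAnalyzer().polarity_scores   # vs(text) -> dict with 'neg', 'neu', 'pos', 'compound'

with open('messages.htm') as fp:                    # hypothetical message export
    soup = BeautifulSoup(fp, 'html.parser')

emotionWith('Alice', soup, 'pos')                   # print Alice's strongly positive messages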
Example #2
def messagesWith(name, soup=soup):
	# Find the one-on-one thread whose first participant matches `name`.
	threads = soup.find_all(attrs={"class":"thread"})
	convo = None
	for t in threads:
		messages = t.children
		members = next(messages)          # the first child lists the participants
		members_list = members.split(',')
		#print(members)
		if len(members_list) == 2 and name in members_list[0]:
			break

	filename = 'messagesWith_' + name + '.html'
	f = open(filename,'w')
	html_text = "<html><head></head><body><h1>Message Log</h1>"
				
	for header in messages:
		user = header.find(attrs={"class":"user"})
		date_text = header.find(attrs={"class":"meta"}).text
		date_text = date_text + '00'  # pad the timezone offset so strptime could parse it
		#date = datetime.strptime(date_text, "%A, %B %d, %Y at %I:%M%p %Z%z")
		message = next(messages)      # the element after each header holds the message body

		print(user.string)
		print(date_text)
		print(message.string)
		try:
			print(vs(message.string))
		except Exception:
			print("Couldn't get sentiment")
		print("-------------")

		html_text += "<hr>"
		html_text += p(user.string)
		html_text += p(date_text)
		html_text += p(message)

	
	html_text += "</body></html>"

	f.write(html_text)
	f.close()
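
`messagesWith` also relies on an undefined helper `p()` to wrap each field in a paragraph tag. A plausible one-liner (the name and behavior are assumptions, not from the original):

# Assumed helper -- not part of the original snippet.
def p(content):
    return "<p>" + str(content) + "</p>"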
Example #3
    # Convert "Tue Mar 29 04:04:22 +0000 2016" to "2016-3-29".
    time = document["created_at"].split()   # note: shadows the stdlib `time` module
    month = months[time[1]]                 # `months` maps abbreviated month names to numbers
    day = time[2]
    year = time[5]
    date = str(year) + "-" + str(month) + "-" + str(day)

    docs.append({"text": document["text"],
                 "date": '"' + date + '"'})   # pre-quoted for the manual JSON writer below

aggregate = {}
count = {}

for doc in docs:
    text = doc["text"].encode('utf-8')   # Python 2 idiom; on Python 3, pass the str directly
    sentiment = vs(text)
    value = sentiment['pos'] - sentiment['neg']   # net sentiment in [-1, 1]
    if doc["date"] not in aggregate:
        aggregate[doc["date"]] = value
        count[doc["date"]] = 1
    else:
        aggregate[doc["date"]] += value
        count[doc["date"]] += 1

# normalize each date's total by its count, then emit a JSON array
f.write("[\n")
entries = []
for date in aggregate:
    aggregate[date] = aggregate[date] / count[date]
    entries.append('{ "date": ' + str(date) +
                   ', "value": ' + str(aggregate[date]) + ' }')
f.write(",\n".join(entries))   # commas between objects, none after the last
f.write("\n]")
Example #4
# Collect the POS-vocabulary feature names (equivalent to list(pos_vocab.keys())).
feature_name_pos = []
for k, v in pos_vocab.items():
    feature_name_pos.append(k)

# L1-regularized logistic regression, then keep only features with nonzero weights.
LR_pos = LogisticRegression(class_weight="balanced", penalty="l1",
                            C=0.01).fit(pos,
                                        np.reshape(target, target.shape[0]))
select2 = SelectFromModel(LR_pos, prefit=True)
y = select2.transform(pos)

# Note: this pickles the raw feature matrix `pos`, not the fitted vectorizer.
save_pos = open("pos.pickle", 'wb')
pickle.dump(pos, save_pos)
save_pos.close()
pprint("Pos_vectorizer has been pickled")

sentiment_analyzer = vs()   # here `vs` is the analyzer class itself (cf. Examples #1-#3)


def more_feats(sent):
    # Surface, readability, and sentiment features for one sentence.
    text = cleaner.basic_cleaning(sent)
    sentiment = sentiment_analyzer.polarity_scores(sent)
    syllables = textstat.syllable_count(text)
    avg_syl_per_word = (
        0.001 + float(syllables)) / float(0.001 + len(word_tokenize(text)))   # smoothed to avoid division by zero
    num_terms = len(sent.split())
    num_words = len(text.split())
    num_unique_words = len(set(text.split()))
    num_char = len(text)
    total_char = len(sent)
    sent = cleaner.preprocessing_stage2(sent)
    urlcount = sent.count("URLHERE")
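
Example #4 depends on a project-specific `cleaner` module plus the data objects `pos`, `target`, and `pos_vocab`, none of which are shown; the library imports it needs are standard, though. A sketch of the assumed preamble:

# Assumed imports for Example #4 -- `cleaner` is project-specific and not reproduced here.
import pickle
from pprint import pprint

import numpy as np
import textstat
from nltk.tokenize import word_tokenize
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as vs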
Example #5

reviewlist = list()
ratinglist = list()
featureset = set()
featurelist = list()
reviewTextlist = list()
scorelist = list()

counter = 0

g = open("./smaller.json", 'r')
for l in g:
    review = eval(l)                     # each line holds a dict literal
    reviewText = review["reviewText"]
    rating = review["overall"]
    score = vs(reviewText)["compound"]
    #length = len(reviewText)
    #featurelist.append([score, length])
    scorelist.append(score)
    ratinglist.append(rating)

#fvector=np.array(featurelist)

'''Earlier variant, kept commented out:
g = open("./smaller.json", 'r')
for l in g:
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    score = vs(reviewText)["compound"]
    s = [lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
         if i not in stop and i not in string.punctuation and i.isalpha()]
    featureset = featureset.union(set(s))
featurelist = list(featureset)
'''
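
These loaders parse each line with `eval`, which will execute arbitrary code if the file is untrusted. `ast.literal_eval` is a safe drop-in for dict-literal lines, and `json.loads` works when the lines are strict JSON; a sketch against the same file:

# Safer line parsing -- a sketch, not from the original.
import ast

with open("./smaller.json") as g:
    for l in g:
        review = ast.literal_eval(l)   # parses the literal without executing code
        print(review["overall"], review["reviewText"][:40])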
Example #6

reviewlist = list()
ratinglist = list()
featureset = set()
featurelist = list()
reviewTextlist = list()
scorelist = list()

counter = 0

g = open("path/to/test_data", 'r')
for l in g:    # first pass: score reviews and build the lemma vocabulary
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    score = vs(reviewText)["compound"]
    s = [
        lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
        if i not in stop and i not in string.punctuation and i.isalpha()
    ]
    featureset = featureset.union(set(s))
featurelist = list(featureset)

g.seek(0)      # rewind: the first pass above consumed the file handle
for l in g:    # second pass
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    s = [
        lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
        if i not in stop and i not in string.punctuation and i.isalpha()
    ]
Example #7
    start = time.time()
    with open("reviews.csv") as infile:
        with open("reviews_sentiments.csv", "w") as outfile:
            index = -1
            for line in infile:
                index += 1
                if index == 0:
                    outfile.write(line)
                    continue
                if index % 100 == 0:
                    print index / 100,   # progress ticker (Python 2 print)
                
                text_split = line.strip().split(',', 4)   # naive split; assumes the text is the fifth column
                text = text_split[4].decode('ascii', 'ignore')
                everything_else = ','.join(text_split[:4])
                score = str(vs(text)['compound'])
                new_line = everything_else + "," + score + "," + text + "\n"

                outfile.write(new_line)
                d[text] = score   # `d` is presumably defined earlier in the enclosing scope

                if max_reviews and index >= max_reviews:
                    break

    print   # end the progress-ticker line (Python 2)
    if max_reviews:
        duration = time.time() - start
        print "%d reviews took %f seconds for an average of %f seconds per review" % (index, duration, duration/index)

        # scores were stored as strings, so convert for a numeric sort
        sorted_reviews = sorted(d.items(), key=lambda x: float(x[1]))
        worst = sorted_reviews[:10]
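
Example #7 is Python 2 (`print` statements, `str.decode`), and splitting on raw commas breaks on reviews that contain quoted commas. A rough Python 3 equivalent using the csv module, keeping the five-column layout the split above assumes:

# Hypothetical Python 3 rewrite of the scoring loop.
import csv

with open("reviews.csv", newline="") as infile, \
        open("reviews_sentiments.csv", "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    writer.writerow(next(reader))    # copy the header row unchanged
    for row in reader:
        text = row[4]                # assumes the review text is the fifth column
        writer.writerow(row[:4] + [vs(text)["compound"], text])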
Example #8

reviewlist = list()
ratinglist = list()
featureset = set()
featurelist = list()
reviewTextlist = list()
scorelist = list()

counter = 0


g = open("path/to/test_data", 'r')
for l in g:    # first pass: build the lemma vocabulary
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    score = vs(reviewText)["compound"]
    s = [lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
         if i not in stop and i not in string.punctuation and i.isalpha()]
    featureset = featureset.union(set(s))
featurelist = list(featureset)

g.seek(0)      # rewind: the first pass above consumed the file handle
for l in g:    # second pass: build a bag-of-words count vector per review
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    s = [lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
         if i not in stop and i not in string.punctuation and i.isalpha()]
    slist = [0] * len(featurelist)
    for item in s:
        slist[featurelist.index(item)] += 1
        #print len(featurelist), len(slist), item
    reviewlist.append(slist)
    ratinglist.append(rating)
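
`featurelist.index(item)` rescans the vocabulary for every token, so the counting loop above is quadratic in vocabulary size; precomputing an index map keeps it linear. A sketch reusing the names from the example:

# Hypothetical speed-up: map each feature to its column once.
feature_index = {feat: col for col, feat in enumerate(featurelist)}

slist = [0] * len(featurelist)
for item in s:
    slist[feature_index[item]] += 1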