Example #1
def find_author_id(author_name):
	# fetch the tweet list once instead of re-collecting it on every
	# iteration; each entry's first element is the author name
	all_tweets = AllT.collect_tweets()
	for n, entry in enumerate(all_tweets):
		if author_name == entry[0]:
			return n
	# implicitly returns None when the author is not found
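Since find_author_id rescans the collection linearly on every call, a dictionary built once gives constant-time lookups instead; a minimal sketch, assuming each collect_tweets() entry is a tuple whose first element is the author name (build_author_index is a hypothetical helper):

def build_author_index(all_tweets):
	# map author name -> position, built in one pass over the tuples
	return dict((entry[0], n) for n, entry in enumerate(all_tweets))

# Usage sketch:
# index = build_author_index(AllT.collect_tweets())
# index.get(author_name)  # None on a miss, no rescan per lookup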
Example #2
	def get_unigrams_bigrams_count(self):
		# build both shared vocabularies with a single pass over the corpus;
		# Get_unigrams_bigrams returns (unigrams, bigrams)
		all_unigrams, all_bigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())

		# fixed vocabularies keep the count columns aligned across tweets
		vect1 = CountVectorizer(vocabulary=all_unigrams)
		unigrams = vect1.fit_transform(self.tweet).toarray()

		vect2 = CountVectorizer(vocabulary=all_bigrams)
		bigrams = vect2.fit_transform(self.tweet).toarray()

		return (unigrams, bigrams)
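Because both vectorizers are given a fixed vocabulary, column j of the output always counts vocabulary entry j, so rows from different tweets stay comparable. A small self-contained illustration of that alignment, with toy data in place of the real vocabularies:

from sklearn.feature_extraction.text import CountVectorizer

vocab = ['cat', 'dog', 'fish']
vect = CountVectorizer(vocabulary=vocab)

# column j counts vocab[j], regardless of the order words appear in
counts = vect.fit_transform(['dog dog cat', 'fish']).toarray()
print(counts)  # [[1 2 0]
               #  [0 0 1]]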
Example #3
	def __init__(self, author_name):
		self.tweets = AllT.collect_topic_context()
		self.id = find_author_id(author_name)

		# build the vocabulary and data matrix in one pass by
		# tokenizing and removing stop words
		self.vocab, self.data = BOW.Get_BOW(self.tweets)
def original_message_unigram_feature(tweet_id):
	# fetch the tweet and the text of the message it was replying to
	author_tweet = dbtweets.find_one({'tweet_id': tweet_id})
	original_text = author_tweet['in_reply_to_status_id_text']

	if original_text is None:
		# this tweet was not replying to another tweet
		return None
	return BOW.Get_unigrams(AllT.collect_original_messages(), original_text)
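The lookup assumes tweet documents store the replied-to text under 'in_reply_to_status_id_text', with None marking a non-reply; a minimal pymongo sketch of that shape (connection, database, and collection names are placeholders):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)   # placeholder connection
dbtweets = client['twitter_db']['tweets']  # placeholder names

# a non-reply carries None in 'in_reply_to_status_id_text'
dbtweets.insert_one({'tweet_id': 1,
                     'tweet_text': 'hello',
                     'in_reply_to_status_id_text': None})
print(dbtweets.find_one({'tweet_id': 1})['in_reply_to_status_id_text'])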
def update_profile_unigram():
	all_profiles = AT.collect_profiles()

	# use CountVectorizer to generate unigram counts over all profiles
	unicount_vect = CountVectorizer(ngram_range=(1,1), lowercase=False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1)
	unicount = unicount_vect.fit_transform(all_profiles).toarray()
	unigrams = unicount_vect.get_feature_names()

	# nltk.cluster.api.ClusterI is an abstract interface and cannot be used
	# directly; a concrete clusterer such as KMeans is needed (2 means and
	# euclidean distance are placeholder choices)
	clusterer = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance)
	y = clusterer.cluster(unicount, assign_clusters=False)

	# Load profile_unigram into MongoDB
	"""
	for n in range(test.find().count()):
		tweetAuthor = test.find()[n]["author"]
		profile_unigram = scipy.sparse.coo_matrix(unicount_vect.transform([test.find()[n]["profile"]]).toarray())
		pickle_profile_unigram = Binary(pickle.dumps(profile_unigram, protocol=2), subtype=128)
		result = test.update_one({"author": tweetAuthor}, {"$set": {"profile_unigram": pickle_profile_unigram}})
	"""
	"""
Example #7
import re
import datetime

from sklearn.feature_extraction.text import CountVectorizer


# The snippet began mid-function; the wrapper signature below is assumed.
# inten_file is an open file of intensifier words, one per line.
def has_intensifier(inten_file, text):
    intensifier = 0
    for line in inten_file.readlines():
        # strip() drops the trailing newline, which would otherwise end up
        # inside the regex and block whole-word matches
        regex = r"\b" + re.escape(line.strip()) + r"\b"

        if re.findall(regex, text, re.IGNORECASE):
            intensifier = 1
            break

    return intensifier
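A quick check of the word-boundary behavior, using an in-memory file in place of inten_file (has_intensifier is the assumed wrapper name from above):

import io

# 'very' matches as a whole word; the 'very' inside 'Every' does not
fake_file = io.StringIO(u'very\nso\n')
print(has_intensifier(fake_file, 'Every answer was very detailed'))  # 1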


#--------
# all_unigrams / all_bigrams are the bag-of-words vocabularies for
# unigrams and bigrams; one call builds both
all_unigrams, all_bigrams = BOW.Get_unigrams_bigrams(AllT.collect_text())
vect1 = CountVectorizer(vocabulary=all_unigrams)
vect2 = CountVectorizer(vocabulary=all_bigrams)
start_time = datetime.datetime.now()

# iterate the cursor once instead of re-running dbtweets.find() per index
for i, tweet in enumerate(dbtweets.find()):
    cur_time = datetime.datetime.now()
    delta = cur_time - start_time
    #print 'this is the ', i+1, 'tweet', 'total time is: ', delta
    tweet_id = tweet['tweet_id']
    tweet_text = tweet['tweet_text']

    number_no_vowels = Pron.count_number_no_vowels(tweet_text)
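With the fixed vocabularies above, transform can be applied per tweet without refitting; a self-contained sketch with a toy vocabulary in place of all_unigrams:

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(vocabulary=['sample', 'tweet'])
row = vect.transform(['just a sample tweet']).toarray()
print(row)  # [[1 1]]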
Example #8
def Get_BOW(tweets):
	# CountVectorizer does the tokenizing; its built-in English
	# stop-word list is used
	count_vect = CountVectorizer(stop_words='english')
	train_counts = count_vect.fit_transform(tweets)

	# get_feature_names() is column-ordered; vocabulary_.keys() is not,
	# so it would not line up with the columns of train_counts
	vocab = count_vect.get_feature_names()
	train_counts = train_counts.toarray()

	return (vocab, train_counts)
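A toy run of Get_BOW; the built-in stop-word list drops 'the' and 'a':

vocab, counts = Get_BOW(['the cat sat', 'a dog sat'])
print(vocab)   # ['cat', 'dog', 'sat']
print(counts)  # [[1 0 1]
               #  [0 1 1]]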

def Get_unigrams_bigrams(tweets):
	# use CountVectorizer to generate unigrams and bigrams
	unicount_vect = CountVectorizer(ngram_range=(1,1), lowercase=False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1)
	unicount = unicount_vect.fit_transform(tweets).toarray()
	unigrams = unicount_vect.get_feature_names()

	bicount_vect = CountVectorizer(ngram_range=(2,2), lowercase=False, stop_words='english', token_pattern=r'\b\w+\b', min_df=1)
	bicount = bicount_vect.fit_transform(tweets).toarray()
	bigrams = bicount_vect.get_feature_names()

	return (unigrams, bigrams)


all_tweets_grams = Get_unigrams_bigrams(AllT.collect_text())

if __name__ == '__main__':
	print "Running as a file, not as imported"
	print all_tweets_grams[0][0:15]
	print all_tweets_grams[1][0:15]