Example #1
def clean_And_Parse_tweets(tweets):
	# Parse each tweet's text into a bag of words
	for tweet in tweets:
		tweet['tweet_words'] = tweet_tokenizer(tweet['tweet_text'])
	# When the tweet carries a URL, fetch the linked page and tokenize its content
	for tweet in tweets:
		if tweet['tweet_urls'] != "":
			webpage = Words_In_Webpage(tweet['tweet_urls'])
			tweet['tweet_webpage_words'] = webcontent_tokenizer(webpage)
	return tweets
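
The three helpers this example calls (tweet_tokenizer, Words_In_Webpage, webcontent_tokenizer) are not shown on this page. Below is a minimal sketch of what they might look like, assuming the tokenizers return lists of lowercase word tokens and the fetcher returns raw HTML; all three implementations are assumptions, not the original code.

import re
import urllib.request
from html.parser import HTMLParser

def tweet_tokenizer(text):
	# Assumed tokenizer: lowercase the text and keep simple word tokens
	return re.findall(r"[a-z0-9']+", text.lower())

def Words_In_Webpage(url):
	# Assumed fetcher: download the raw HTML of the linked page
	with urllib.request.urlopen(url, timeout=10) as resp:
		return resp.read().decode("utf-8", errors="ignore")

class _TextExtractor(HTMLParser):
	# Collects the text nodes of an HTML document, dropping the tags
	def __init__(self):
		super().__init__()
		self.chunks = []
	def handle_data(self, data):
		self.chunks.append(data)

def webcontent_tokenizer(html):
	# Assumed content tokenizer: strip tags, then reuse the tweet tokenizer
	parser = _TextExtractor()
	parser.feed(html)
	return tweet_tokenizer(" ".join(parser.chunks))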
Example #2
def clean_And_Parse_tweets(tweets):
	# Parse each tweet's text into a bag of words
	for tweet in tweets:
		tweet['tweet_words'] = tweet_tokenizer(tweet['tweet_text'])
	# Build a word-count vector from the tweet words and, when a URL is
	# present, from the linked webpage's content as well
	for tweet in tweets:
		wordvec = {}
		for word in tweet['tweet_words']:
			wordvec.setdefault(word, 0)
			wordvec[word] += 1
		if tweet['tweet_urls'] != "":
			webpage = Words_In_Webpage(tweet['tweet_urls'])
			tweet['tweet_webpage_words'] = webcontent_tokenizer(webpage)
			for word in tweet['tweet_webpage_words']:
				wordvec.setdefault(word, 0)
				wordvec[word] += 1
		# Assign the counts unconditionally so tweets without a URL still
		# get a word vector built from their own words
		tweet["word_vector"] = wordvec
	return tweets
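
The counting loops in Example #2 can be written more compactly with collections.Counter from the standard library. A sketch of the same idea; the function name build_word_vector is hypothetical, not from the original.

from collections import Counter

def build_word_vector(tweet):
	# Combine tweet words with any webpage words and count occurrences;
	# dict(Counter(...)) reproduces the {word: count} shape used above
	words = list(tweet.get('tweet_words', []))
	words += tweet.get('tweet_webpage_words', [])
	return dict(Counter(words))

For example, build_word_vector({'tweet_words': ['a', 'b', 'a']}) returns {'a': 2, 'b': 1}.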