Example No. 1
import stemming  # project-specific stemmer module


def main(tweet, tweetSpots):
    wordsFreq = {}
    allWords = []

    tweetWords = tweet.split(" ")
    tweetWords += tweetSpots.keys()
    for word in tweetWords:
        # Reset per token so spot words are added only once.
        spotsWords = []
        if "_" not in word:
            word = stemming.main(word)
        else:
            # Multi-word spots arrive joined by underscores; stem each part.
            spotsWordsRaw = word.split("_")
            spotsWords = [
                stemming.main(spotword) for spotword in spotsWordsRaw
                if stemming.main(spotword) != ""
                and "http" not in stemming.main(spotword)
            ]
        if word != "" and "http" not in word and "_" not in word:
            allWords.append(word)
        allWords += spotsWords

    # Relative frequency of each stemmed word.
    for word in allWords:
        if word not in wordsFreq:
            wordsFreq[word] = 1
        else:
            wordsFreq[word] += 1
    total = sum(wordsFreq.values())
    for word in wordsFreq:
        wordsFreq[word] /= float(total)
    return wordsFreq
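
A hypothetical call, assuming stemming.main returns a lowercased stem and tweetSpots maps underscore-joined spot names to their payloads (both project-specific assumptions):

freqs = main("Watching the game tonight", {"new_york": 1})
# freqs maps each stemmed word to its relative frequency; values sum to 1.
print freqs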
Example No. 2
import json

import cooccurrences, getId, getWordsSimilarity, stemming, storeWordsSimilarity


def calculate(givenWord, loop, first, second):
    # tweetIdList, tweetId_words, and vector1 are module-level globals
    # populated by main() (see Example No. 5).
    wordId1 = getId.main(stemming.main(givenWord))
    for tweetId in tweetIdList[first:second]:
        oneTweetWords = tweetId_words[tweetId]
        for word in oneTweetWords:
            wordId2 = getId.main(stemming.main(word))
            co = cooccurrences.main(wordId1, wordId2)
            if co > 0:
                similarityWord = getWordsSimilarity.main(wordId1, wordId2)
                if similarityWord is None:
                    # Cache miss: compute the similarity once and persist it.
                    vector2 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId2)))
                    similarityWord = countSim(vector1, vector2)
                    storeWordsSimilarity.main(wordId1, wordId2, similarityWord)
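
The project's actual countSim is not shown here. A minimal sketch, assuming the word-distribution vectors are word-to-frequency dicts, could be cosine similarity (an assumption, not the project's verified metric):

import math

def countSim(v1, v2):
    # Hypothetical stand-in: cosine similarity over the shared
    # vocabulary of two frequency dicts.
    dot = sum(v1[w] * v2[w] for w in v1 if w in v2)
    n1 = math.sqrt(sum(x * x for x in v1.values()))
    n2 = math.sqrt(sum(x * x for x in v2.values()))
    if n1 == 0 or n2 == 0:
        return 0.0
    return dot / (n1 * n2)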
Example No. 3
import wikipedia

import getAmbiguousPages, stemming


def main(title):
    documents = getAmbiguousPages.main(title)
    wikiVector_hash = {}
    for c in documents:
        wikiTitle = c.capitalize()
        # Escape parentheses literally; a lone "\(" is an invalid escape.
        wikiTitle = wikiTitle.replace("(", "\\(")
        wikiTitle = wikiTitle.replace(")", "\\)")
        wikiTitle = wikiTitle.replace("_", " ")
        wordsFreq = {}
        try:
            p = wikipedia.page(wikiTitle)
            content = p.content

            extractList = content.split(" ")
            extractStemmedList = []
            for word in extractList:
                word = stemming.main(word)
                if word != "":
                    extractStemmedList.append(word)
            for word in extractStemmedList:
                if word not in wordsFreq:
                    wordsFreq[word] = 1
                else:
                    wordsFreq[word] += 1
            # Normalize by document length, not by distinct-word count.
            total = len(extractStemmedList)
            for word in wordsFreq:
                wordsFreq[word] /= float(total)
        except Exception:
            # Missing or disambiguation pages leave an empty vector.
            pass
        wikiVector_hash[c] = wordsFreq
    return wikiVector_hash
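
Hypothetical usage, assuming getAmbiguousPages.main returns the candidate Wikipedia titles for an ambiguous term:

vectors = main("jaguar")
for page in vectors:
    print page, len(vectors[page])  # stemmed-word frequency vector per page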
Example No. 4
import wikipedia

import getAmbiguousPages, stemming


def main(title):
    documents = getAmbiguousPages.main(title)
    wikiVector_hash = {}
    for c in documents:
        wikiTitle = c.capitalize()
        # Escape parentheses literally; a lone "\(" is an invalid escape.
        wikiTitle = wikiTitle.replace("(", "\\(")
        wikiTitle = wikiTitle.replace(")", "\\)")
        wikiTitle = wikiTitle.replace("_", " ")
        wordsFreq = {}
        try:
            p = wikipedia.page(wikiTitle)
            content = p.content

            extractList = content.split(" ")
            extractStemmedList = []
            for word in extractList:
                word = stemming.main(word)
                if word != "":
                    extractStemmedList.append(word)
            for word in extractStemmedList:
                if word not in wordsFreq:
                    wordsFreq[word] = 1
                else:
                    wordsFreq[word] += 1
            # Normalize by document length, not by distinct-word count.
            total = len(extractStemmedList)
            for word in wordsFreq:
                wordsFreq[word] /= float(total)
        except Exception:
            pass
        wikiVector_hash[c] = wordsFreq
    return wikiVector_hash
Example No. 5
import json
import time

import MySQLdb

import getId, stemming


def main(givenWord):
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    wordId = getId.main(stemming.main(givenWord))
    global tweetIdList
    global tweetId_words
    global vector1
    tweetId_words = {}
    # Ping until the connection reports itself open.
    while db_mysql.open != 1:
        db_mysql.ping()
    cursor = db_mysql.cursor()
    # givenWord is interpolated as a per-word tweet table name. The extra
    # wordId filters drop the word itself plus love, like, day, lol, today,
    # tomorrow, time, tonight, thing, ready, found, free.
    getTweetWords = "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from " + givenWord + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId) where A.wordId !=" + str(wordId) + " and A.wordId != 1804507 and A.wordId != 1040690 and A.wordId != 1111170 and A.wordId != 991563 and A.wordId != 13304 and A.wordId != 3368935 and A.wordId != 2113819 and A.wordId != 1990840 and A.wordId != 2977454 and A.wordId != 3489500 and A.wordId != 1326944 and A.wordId != 419686"
    cursor.execute(getTweetWords)
    resultsRaw = cursor.fetchall()
    cursor.close()
    db_mysql.close()
    for result in resultsRaw:
        tweetId = result[0]
        word = result[1]
        if tweetId not in tweetId_words:
            tweetId_words[tweetId] = []
        tweetId_words[tweetId].append(word)

    vector1 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)))

    tweetIdList = tweetId_words.keys()
    print "length:", len(tweetIdList)
    time.sleep(1)
    tweetIdList.sort()
    calculate(givenWord, 0, 0, len(tweetIdList))
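
Note that givenWord is spliced into the query as a table name, so the call presumes a per-word tweet table already exists in CikmTwitterDataSet. A hypothetical invocation:

main("coffee")  # assumes a table named "coffee" holds the matching tweet ids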
Example No. 6
import cooccurrences, getId, getWordsRefinedSimilarity, stemming


def calculate(givenWord, loop, first, second):
    fd = open("Luna" + givenWord + "RemoveCo0" + str(loop), "w+")
    wordId1 = getId.main(stemming.main(givenWord))
    for tweetId in tweetIdList[first:second]:
        similarityFinal = 0
        oneTweetWords = tweetId_words[tweetId]
        length = len(oneTweetWords)
        for word in oneTweetWords:
            wordId2 = getId.main(stemming.main(word))
            co = cooccurrences.main(wordId1, wordId2)
            if co > 0:
                similarityWord = getWordsRefinedSimilarity.main(wordId1, wordId2)
                similarityFinal += float(similarityWord)
        # Average the refined similarity over all words in the tweet.
        similarityFinal = float(similarityFinal) / length
        fd.write(str(tweetId))
        fd.write("\t")
        fd.write(str(similarityFinal))
        fd.write("\n")
        fd.flush()
    fd.close()
Example No. 7
import MySQLdb

import getId, stemming


def main(givenWord):
    wordId = getId.main(stemming.main(givenWord))
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    # Rescale raw similarities in place: (Max - |log10(Similarity)|) * 10.
    Max = 2.62507
    update = "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(Max) + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
    print update
    cursor.execute(update)
    db_mysql.commit()
    cursor.close()
    db_mysql.close()
Example No. 8
def main(givenWord):
    wordId = getId.main(stemming.main(givenWord))
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    Max = 2.62507
    update = "UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(
        Max) + "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL"
    print update
    cursor.execute(update)
    db_mysql.commit()
    cursor.close()
    db_mysql.close()
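
The UPDATE rescales each stored similarity in place; the same transform in plain Python, with Max hard-coded as above:

import math

def refine(similarity, Max=2.62507):
    # Mirrors the SQL: (Max - |log10(Similarity)|) * 10
    return (Max - abs(math.log10(similarity))) * 10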
Example No. 9
import json
import os

import MySQLdb

import getId, stemming


def main(givenWord):
    wordId = getId.main(stemming.main(givenWord))
    global tweetIdList
    global tweetId_words
    global vector1
    tweetId_words = {}
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    # givenWord is interpolated as a per-word tweet table name.
    getTweetWords = "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from " + givenWord + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId) where A.wordId !=" + str(wordId)
    cursor.execute(getTweetWords)
    resultsRaw = cursor.fetchall()
    for result in resultsRaw:
        tweetId = result[0]
        word = result[1]
        if tweetId not in tweetId_words:
            tweetId_words[tweetId] = []
        tweetId_words[tweetId].append(word)
    cursor.close()
    db_mysql.close()

    vector1 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)))

    # Fork eight workers, each scoring one contiguous slice of the tweet ids.
    pids = []
    tweetIdList = tweetId_words.keys()
    tweetIdList.sort()
    for loop in range(8):
        pid = os.fork()
        pids.append(pid)
        if pid == 0:
            first = loop * len(tweetIdList) / 8
            second = (loop + 1) * len(tweetIdList) / 8
            calculate(givenWord, loop, first, second)
            os._exit(0)
    # Parent waits for all children before returning.
    for pid in pids:
        os.waitpid(pid, 0)
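
Each child gets one contiguous slice of the sorted id list; a quick check of the partition arithmetic (integer division, as in the Python 2 original):

n = len(tweetIdList)
slices = [(loop * n / 8, (loop + 1) * n / 8) for loop in range(8)]
# Consecutive (first, second) pairs cover 0..n with no gaps or overlaps.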
Example No. 10
#!/usr/bin/python
import MySQLdb
import getId, stemming


def main(word1, word2):
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222',
                               'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    sql = "SELECT Similarity FROM WordsSimilarity WHERE Word1=" + str(
        word1) + " AND Word2=" + str(word2)
    similarity = None
    try:
        cursor.execute(sql)
        similarity = cursor.fetchone()
    except Exception:
        db_mysql.rollback()
    cursor.close()
    db_mysql.close()
    # fetchone() returns a 1-tuple, or None when no row matches.
    if similarity is not None:
        return similarity[0]
    return None


words1 = getId.main(stemming.main("popcorn"))
words2 = getId.main(stemming.main("microwave"))
print main(words1, words2)
Example No. 11
for senseId in senses:
    max = 0
    match = ""
    finalMatch = ""
    senseWords = senses[senseId]
    for article in tfidf:
        sum = 0
        count = 0
        tfidfwords = tfidf[article]
        article_length = len(tfidfwords)
        for word in senseWords:
            try:
                # Multi-word senses are joined by underscores; count each
                # stemmed part that appears in the article's tf-idf words.
                for part in word.split("_"):
                    if stemming.main(part) in tfidfwords:
                        count += 1
            except:
                pass
        if article_length != 0:
            sum = float(count) / article_length
        else:
            sum = float(0)
        if sum > max:
            max = sum
            match = article
    if match == "":
        finalMatch = None
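
The fragment scores each article by the fraction of stemmed sense words it contains; the same idea packaged as a standalone helper (hypothetical, assuming stemming.main as above):

def overlap_score(senseWords, tfidfwords):
    count = 0
    for word in senseWords:
        for part in word.split("_"):
            if stemming.main(part) in tfidfwords:
                count += 1
    if len(tfidfwords) == 0:
        return 0.0
    return float(count) / len(tfidfwords)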
Example No. 12
import stemming
import BM25
import QUERY_LIKELIHOOD
import TF_IDF

if __name__ == '__main__':
    stemming.main()
    BM25.main()
    QUERY_LIKELIHOOD.main()
    TF_IDF.main()
Example No. 13
max = 0
match = ""
finalMatch = ""
senseWords = senses[senseId]
for article in tfidf:
    print "wiki:", article
    sum = 0
    count = 0
    tfidfwords = tfidf[article]
    article_length = len(tfidfwords)
    for word in senseWords:
        try:
            # Count each stemmed part present in the article and
            # accumulate its tf-idf weight.
            for part in word.split("_"):
                if stemming.main(part) in tfidfwords:
                    count += 1
                    sum += tfidfwords[stemming.main(part)]
        except:
            pass
    print "overlap:", count
    print "length:", article_length
    if article_length != 0:
        # Boost articles sharing more sense words: weight by count^1.5.
        sum *= (float(count) ** 1.5) / article_length
    else:
        sum = float(0)
    print "sum final:", sum