import stemming


def main(tweet, tweetSpots):
    """Build a normalized word-frequency vector for a tweet plus its spotted entities."""
    wordsFreq = {}
    allWords = []
    tweetWords = tweet.split(" ")
    tweetWords += tweetSpots.keys()
    for word in tweetWords:
        if "_" not in word:
            word = stemming.main(word)
            if word != "" and "http" not in word:
                allWords.append(word)
        else:
            # Multi-word spots are joined with "_"; stem each part and keep the clean ones.
            # Extending allWords here (per spot) avoids re-adding a stale spot list on later iterations.
            spotsWordsRaw = word.split("_")
            allWords += [stemming.main(spotword) for spotword in spotsWordsRaw
                         if stemming.main(spotword) != "" and "http" not in stemming.main(spotword)]
    for word in allWords:
        if word not in wordsFreq:
            wordsFreq[word] = 1
        else:
            wordsFreq[word] += 1
    # Normalize counts to relative frequencies; guard against an empty tweet.
    total = sum(wordsFreq.values())
    if total > 0:
        for word in wordsFreq:
            wordsFreq[word] /= float(total)
    return wordsFreq
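
# A minimal usage sketch (the sample tweet and spot are illustrative
# assumptions, not from the original source; tweetSpots maps spot -> anything,
# since only its keys are used):
if __name__ == "__main__":
    tweet = "watching a movie with popcorn tonight"
    tweetSpots = {"movie_theater": 1}
    print main(tweet, tweetSpots)
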
import json

import cooccurrences
import getId
import getWordsSimilarity
import stemming
import storeWordsSimilarity


def calculate(givenWord, loop, first, second):
    # Globals (tweetIdList, tweetId_words, vector1) are populated by the driver's main().
    wordId1 = getId.main(stemming.main(givenWord))
    for tweetId in tweetIdList[first:second]:
        oneTweetWords = tweetId_words[tweetId]
        for word in oneTweetWords:
            wordId2 = getId.main(stemming.main(word))
            co = cooccurrences.main(wordId1, wordId2)
            if co > 0:
                # Reuse a cached similarity if one exists; otherwise compute it and store it.
                similarityWord = getWordsSimilarity.main(wordId1, wordId2)
                if similarityWord is None:
                    vector2 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId2)))
                    similarityWord = countSim(vector1, vector2)
                    storeWordsSimilarity.main(wordId1, wordId2, similarityWord)
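
# countSim is called above but not defined in this file. A minimal sketch,
# assuming the two vectors are word -> frequency dicts (as built elsewhere in
# this repo) and that the intended measure is cosine similarity; both the
# metric and the body below are assumptions, not taken from the original source.
import math

def countSim(vec1, vec2):
    # Dot product over the words the two distributions share.
    dot = sum(vec1[w] * vec2[w] for w in vec1 if w in vec2)
    norm1 = math.sqrt(sum(v * v for v in vec1.values()))
    norm2 = math.sqrt(sum(v * v for v in vec2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)
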
import wikipedia

import getAmbiguousPages
import stemming


def main(title):
    """Build a stemmed word-frequency vector for each candidate Wikipedia sense of title."""
    documents = getAmbiguousPages.main(title)
    wikiVector_hash = {}
    for c in documents:
        wikiTitle = c.capitalize()
        # Escape parentheses so the title survives the lookup; underscores become spaces.
        wikiTitle = wikiTitle.replace("(", "\\(")
        wikiTitle = wikiTitle.replace(")", "\\)")
        wikiTitle = wikiTitle.replace("_", " ")
        wordsFreq = {}
        try:
            p = wikipedia.page(wikiTitle)
            content = p.content
            extractStemmedList = []
            for word in content.split(" "):
                word = stemming.main(word)
                if word != "":
                    extractStemmedList.append(word)
            for word in extractStemmedList:
                if word not in wordsFreq:
                    wordsFreq[word] = 1
                else:
                    wordsFreq[word] += 1
            # Normalize by the article length (total number of stemmed tokens).
            total = len(extractStemmedList)
            if total > 0:
                for word in wordsFreq:
                    wordsFreq[word] /= float(total)
        except Exception:
            # Pages that fail to resolve are left with an empty vector.
            pass
        wikiVector_hash[c] = wordsFreq
    return wikiVector_hash
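
# A minimal usage sketch (the query word is an illustrative assumption):
if __name__ == "__main__":
    senseVectors = main("popcorn")
    for sense in senseVectors:
        print sense, "->", len(senseVectors[sense]), "stemmed terms"
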
def main(givenWord): db_mysql = MySQLdb.connect('141.117.3.92','lunafeng','luna222','CikmTwitterDataSet') wordId = getId.main(stemming.main(givenWord)) global tweetIdList global tweetId_words global vector1 tweetId_words = {} while(db_mysql.open != 1): db_mysql.ping() cursor = db_mysql.cursor() # removed words itself and love, like, day, lol, today, tomorrow, time, tonight, thing, ready, found,free getTweetWords = "select A.tweetId,w.word from Words w join (select c.tweetId,tw.wordId from " + givenWord + " c join TweetsWords tw on (c.TweetId = tw.TweetId)) A on (w.id = A.wordId) where A.wordId !=" + str(wordId) + " and A.wordId != 1804507 and A.wordId != 1040690 and A.wordId != 1111170 and A.wordId != 991563 and A.wordId != 13304 and A.wordId != 3368935 and A.wordId != 2113819 and A.wordId != 1990840 and A.wordId != 2977454 and A.wordId != 3489500 and A.wordId != 1326944 and A.wordId != 419686" cursor.execute(getTweetWords) resultsRaw = cursor.fetchall() cursor.close() db_mysql.close() for result in resultsRaw: tweetId = result[0] word = result[1] if tweetId not in tweetId_words: tweetId_words[tweetId] = [] tweetId_words[tweetId].append(word) vector1 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/"+str(wordId))) pids = [] tweetIdList = tweetId_words.keys() print "length:",len(tweetIdList) time.sleep(1) tweetIdList.sort() calculate(givenWord,0,0,len(tweetIdList))
import cooccurrences
import getId
import getWordsRefinedSimilarity
import stemming


def calculate(givenWord, loop, first, second):
    # Each worker writes its own output shard, keyed by its loop index.
    fd = open("Luna" + givenWord + "RemoveCo0" + str(loop), "w+")
    wordId1 = getId.main(stemming.main(givenWord))
    for tweetId in tweetIdList[first:second]:
        similarityFinal = 0
        oneTweetWords = tweetId_words[tweetId]
        length = len(oneTweetWords)
        for word in oneTweetWords:
            wordId2 = getId.main(stemming.main(word))
            co = cooccurrences.main(wordId1, wordId2)
            if co > 0:
                similarityWord = getWordsRefinedSimilarity.main(wordId1, wordId2)
                similarityFinal += float(similarityWord)
        # Average the refined similarity over the words in the tweet; guard empty tweets.
        if length > 0:
            similarityFinal = float(similarityFinal) / length
        fd.write(str(tweetId))
        fd.write("\t")
        fd.write(str(float(similarityFinal)))
        fd.write("\n")
        fd.flush()
import MySQLdb

import getId
import stemming


def main(givenWord):
    # wordId is resolved for consistency with the other scripts; the UPDATE below is global.
    wordId = getId.main(stemming.main(givenWord))
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    # Rescale every raw similarity: RefinedSimilarity = (Max - |log10(Similarity)|) * 10.
    Max = 2.62507
    update = ("UPDATE WordsSimilarity SET RefinedSimilarity=(" + str(Max) +
              "-ABS(Log(10,Similarity)))*10 WHERE RefinedSimilarity IS NULL")
    print update
    cursor.execute(update)
    db_mysql.commit()
    cursor.close()
    db_mysql.close()
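
# The SQL above maps a raw similarity s to (Max - |log10(s)|) * 10, so raw
# similarities closer to 1 get larger refined scores. A small Python
# illustration of the same formula (the function name and sample values are
# assumptions for illustration only):
import math

def refine(similarity, maxLog=2.62507):
    return (maxLog - abs(math.log10(similarity))) * 10

# refine(1.0)  -> 26.25  (a raw similarity of 1 gets the maximum refined score)
# refine(0.01) -> 6.25   (smaller similarities are pushed toward zero)
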
import json
import os

import MySQLdb

import getId
import stemming


def main(givenWord):
    wordId = getId.main(stemming.main(givenWord))
    global tweetIdList
    global tweetId_words
    global vector1
    tweetId_words = {}
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    # Fetch every word co-occurring with givenWord in its tweet table, excluding the word itself.
    getTweetWords = ("select A.tweetId,w.word from Words w join "
                     "(select c.tweetId,tw.wordId from " + givenWord + " c "
                     "join TweetsWords tw on (c.TweetId = tw.TweetId)) A "
                     "on (w.id = A.wordId) where A.wordId != " + str(wordId))
    cursor.execute(getTweetWords)
    resultsRaw = cursor.fetchall()
    for result in resultsRaw:
        tweetId = result[0]
        word = result[1]
        if tweetId not in tweetId_words:
            tweetId_words[tweetId] = []
        tweetId_words[tweetId].append(word)
    cursor.close()
    db_mysql.close()
    vector1 = json.load(open("/data/CikmTwitterProject/Paper/SearchTweets/WordsDistribution#5/" + str(wordId)))
    pids = []
    tweetIdList = tweetId_words.keys()
    tweetIdList.sort()
    # Fork eight workers, each scoring an equal slice of the tweet list.
    for loop in range(8):
        pid = os.fork()
        if pid == 0:
            first = loop * len(tweetIdList) / 8
            second = (loop + 1) * len(tweetIdList) / 8
            calculate(givenWord, loop, first, second)
            os._exit(0)
        pids.append(pid)
    # Wait for every worker to finish before returning.
    for pid in pids:
        os.waitpid(pid, 0)
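
# The os.fork fan-out above is Unix-specific. A sketch of the same eight-way
# split using the standard multiprocessing module instead (a substitute shown
# for illustration, not the original approach; the globals calculate reads are
# inherited by children on fork-based platforms):
from multiprocessing import Process

def run_parallel(givenWord, workers=8):
    procs = []
    for loop in range(workers):
        first = loop * len(tweetIdList) / workers
        second = (loop + 1) * len(tweetIdList) / workers
        p = Process(target=calculate, args=(givenWord, loop, first, second))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
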
#!/usr/bin/python
import MySQLdb

import getId
import stemming


def main(word1, word2):
    db_mysql = MySQLdb.connect('141.117.3.92', 'lunafeng', 'luna222', 'CikmTwitterDataSet')
    db_mysql.ping()
    cursor = db_mysql.cursor()
    sql = ("SELECT Similarity FROM WordsSimilarity WHERE Word1=" + str(word1) +
           " AND Word2=" + str(word2))
    # Initialize so a failed query cannot leave similarity undefined.
    similarity = None
    try:
        cursor.execute(sql)
        similarity = cursor.fetchone()
    except MySQLdb.Error:
        db_mysql.rollback()
    cursor.close()
    db_mysql.close()
    # fetchone() returns None when no cached similarity exists for the pair.
    if similarity is not None:
        return similarity[0]
    return None


if __name__ == '__main__':
    words1 = getId.main(stemming.main("popcorn"))
    words2 = getId.main(stemming.main("microwave"))
    print main(words1, words2)
# Fragment: senses, tfidf, and the stemming module are defined elsewhere in this script.
for senseId in senses:
    maxScore = 0
    match = ""
    finalMatch = ""
    senseWords = senses[senseId]
    for article in tfidf:
        count = 0
        tfidfwords = tfidf[article]
        article_length = len(tfidfwords)
        # Count how many sense words (splitting multi-word spots on "_") appear in the article.
        for word in senseWords:
            try:
                if "_" in word:
                    for part in word.split("_"):
                        if stemming.main(part) in tfidfwords:
                            count += 1
                else:
                    if stemming.main(word) in tfidfwords:
                        count += 1
            except Exception:
                pass
        # Score is the overlap count normalized by the article's vocabulary size.
        if article_length != 0:
            score = float(count) / article_length
        else:
            score = float(0)
        if score > maxScore:
            maxScore = score
            match = article
    if match == "":
        finalMatch = None
    else:
        finalMatch = match
import stemming
import BM25
import QUERY_LIKELIHOOD
import TF_IDF

if __name__ == '__main__':
    stemming.main()
    BM25.main()
    QUERY_LIKELIHOOD.main()
    TF_IDF.main()
# Fragment: runs once per senseId; senses, tfidf, and stemming are defined elsewhere.
maxScore = 0
match = ""
finalMatch = ""
senseWords = senses[senseId]
for article in tfidf:
    print "wiki:", article
    score = 0
    count = 0
    tfidfwords = tfidf[article]
    article_length = len(tfidfwords)
    for word in senseWords:
        try:
            if "_" in word:
                for part in word.split("_"):
                    stemmed = stemming.main(part)
                    if stemmed in tfidfwords:
                        count += 1
                        score += tfidfwords[stemmed]
            else:
                stemmed = stemming.main(word)
                if stemmed in tfidfwords:
                    count += 1
                    score += tfidfwords[stemmed]
        except Exception:
            pass
    print "overlap:", count
    print "length:", article_length
    # Weight the accumulated tf-idf mass by overlap^1.5, normalized by article length.
    if article_length != 0:
        score *= (float(count) ** 1.5) / article_length
    else:
        score = float(0)
    print "sum final:", score
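
# The scoring rule above can be read as: total tf-idf mass of the matched
# words, boosted by overlap^1.5 and normalized by article length. A standalone
# sketch of that rule (the function name is an assumption for illustration):
def senseScore(tfidfMass, overlap, articleLength):
    if articleLength == 0:
        return 0.0
    return tfidfMass * (float(overlap) ** 1.5) / articleLength
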