def formCSVFromGeoInMessages(outputDir, portInput):
    """Write a per-token longitude-zone ground-truth CSV.

    Reads Temp_Analysis.TokenToCoordinates_Combined3Div, where each document
    is ``{_id: token, coordinates: [-1|0|1, ...]}`` (-1 = Americas,
    0 = Africa/Europe, +1 = Asia/Australia, as produced by
    tokenToCoordinates3Div). For each token it counts occurrences per zone,
    derives the majority-zone label, and appends one CSV row.

    :param outputDir: directory (with trailing separator) for the output CSV
    :param portInput: MongoDB port passed to getMongoClient
    """
    from MongoDBInterface import getMongoClient

    # Single client; the original built the same client/db twice back to back.
    client = getMongoClient(portInput)
    collectionToRead = client["Temp_Analysis"]["TokenToCoordinates_Combined3Div"]
    tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)
    rows = [[
        'id', "label", "Americas", "Africa_Europe", "Asia_Australia",
        "ratioAmericas", "ratioAfrica_Europe", "ratioAsia_Australia",
        "ratioMax", "total"
    ]]
    for userInfo in tweetCursor:
        values = userInfo['coordinates']
        # Zone encoding: negative -> Americas, zero -> Africa/Europe,
        # positive -> Asia/Australia.
        Americas = sum(1 for v in values if v < 0)
        Africa_Europe = sum(1 for v in values if v == 0)
        Asia_Australia = sum(1 for v in values if v > 0)
        total = float(len(values))
        ratioAmericas = Americas / total
        ratioAfrica_Europe = Africa_Europe / total
        ratioAsia_Australia = Asia_Australia / total
        ratioMax = max(ratioAmericas, ratioAfrica_Europe, ratioAsia_Australia)
        if ratioAmericas == ratioMax:
            label = "Americas"
        elif ratioAfrica_Europe == ratioMax:
            label = "Africa_Europe"
        else:
            label = "Asia_Australia"
        # BUG FIX: the original appended userInfo['_id'].encode("utf-8"),
        # which on Python 3 puts a bytes repr (b'...') into the CSV cell.
        # The id is written as a plain str instead.
        rows.append([
            userInfo['_id'], label, Americas, Africa_Europe, Asia_Australia,
            ratioAmericas, ratioAfrica_Europe, ratioAsia_Australia, ratioMax,
            len(values)
        ])
    fileToStoreTo = outputDir + "combineDBsCoordinateGroundTruthDiv3.csv"
    from Step2ProcessTableOfTweets import writeRowsToCSV
    writeRowsToCSV(rows, fileToStoreTo)
def performRawCollection(db_name, portIn):
    """Stream the public Twitter sample into MongoDB, rotating daily.

    Starts a background tweepy sample stream (written by the module-level
    ``listener`` via the global ``db``/``collectionName``), then once per day
    points ``collectionName`` at a fresh "DayN" collection. Runs forever.

    :param db_name: MongoDB database that receives the sampled tweets
    :param portIn: MongoDB port passed to getMongoClient
    """
    port = portIn
    global db
    global collectionName
    from tweepy import Stream
    from TwitterAPI import getAPI
    import time

    twitterAPI1 = getAPI()
    twitterStream = Stream(auth=twitterAPI1.auth, listener=listener())
    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    db = client[db_name]
    collectionName = "Day1"
    print("Start day 1. " + "Writing to:" + db_name + "," + str(collectionName))
    # BUG FIX: `sample(async=True)` is a SyntaxError on Python >= 3.7 because
    # `async` became a reserved keyword; tweepy renamed the parameter to
    # `is_async` in tweepy 3.7.0.
    twitterStream.sample(is_async=True)
    dayCount = 2
    while True:
        time.sleep(86400)  # one day; the stream keeps running in its thread
        print("Finished collection for day:" + str(dayCount - 1))
        print("Starting day " + str(dayCount))
        # Rebinding the global redirects the listener to a new collection.
        collectionName = "Day" + str(dayCount)
        print("Start day " + str(dayCount) + ". Writing to:" + db_name +
              "," + str(collectionName))
        dayCount += 1
def mainProcessFollowers(api, db_name, collectionName, screenNames, influencerScreenNamesLowerCaseToProper, portInput, maxFollowersToCollect=1000, filePathToStoreToInput=None):
    """Collect follower-ID samples for influencers not yet written to Mongo.

    :param api: authenticated Twitter API handle
    :param db_name: MongoDB database name
    :param collectionName: collection recording which influencers are done
    :param screenNames: influencer screen names to process
    :param influencerScreenNamesLowerCaseToProper: lowercase -> proper-case map
    :param portInput: MongoDB port
    :param maxFollowersToCollect: sample size per influencer
    :param filePathToStoreToInput: optional pickle path for the follower list
    """
    from datetime import datetime
    print("Starting Collecting Followers with Follower Sample size = " +
          str(maxFollowersToCollect) + " " + str(datetime.now()))
    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    collectionToWrite = client[db_name][collectionName]
    from MongoDBInterface import getUsersUnwrittenToMongoDB
    usersUnwritten = getUsersUnwrittenToMongoDB(
        collectionToWrite, screenNames, "_id",
        influencerScreenNamesLowerCaseToProper)
    print("Users unwritten " + str(len(usersUnwritten)))
    for screenName in usersUnwritten:
        # IDIOM FIX: `is not None` instead of `!= None`.
        if filePathToStoreToInput is not None:
            getFollowerIDs(api, screenName, maxFollowersToCollect,
                           collectionToWrite, filePathToStoreToInput)
        else:
            getFollowerIDs(api, screenName, maxFollowersToCollect,
                           collectionToWrite)
    print("Finished Collecting Followers " + str(datetime.now()))
def getHoursAccordingToFollowerOrder(db_name, outputDirFollower, portInput=None):
    """Return (hoursInOrder, datesInOrder) for an influencer's followers.

    Loads the follower-order pickle written during collection, looks up each
    follower's account-creation time in <db_name>.followerInfo, and returns
    the creation hours and datetimes in follower order. Exits the process
    (original behavior) if no hours could be resolved.

    :param db_name: influencer screen name == MongoDB database name
    :param outputDirFollower: directory holding <db_name>.pickle
    :param portInput: optional MongoDB port; falls back to global ``port``
    """
    global port
    if portInput is not None:
        port = portInput
    from MongoDBInterface import getMongoClient
    import pickle

    # Single client; the original constructed the same client twice.
    client = getMongoClient(port)
    collectionToRead = client[db_name]["followerInfo"]
    # BUG FIX: if the DB or its completion-marker collection is missing, the
    # original never assigned `followers` and the loop below raised NameError.
    # Defaulting to [] routes that case into the explicit error path instead.
    followers = []
    if db_name in client.list_database_names():
        db = client[db_name]
        if "influencerOverWhichFollowerInfoCollected" in db.list_collection_names():
            userInfoPath = outputDirFollower + str(db_name.lower()) + ".pickle"
            with open(userInfoPath, "rb") as fp:
                followers = pickle.load(fp)
    idToDate = {}
    tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)
    for userInfo in tweetCursor:
        idToDate[str(userInfo["id_str"])] = userInfo["created_at"]
    hoursInOrder = []
    datesInOrder = []
    for follower in followers:
        if follower in idToDate:
            # NOTE(review): assumes created_at is stored as a datetime
            # (it exposes .hour) — confirm against the collection writer.
            hoursInOrder.append(idToDate[follower].hour)
            datesInOrder.append(idToDate[follower])
    print(str(len(hoursInOrder)) + " account creation times in order")
    if len(hoursInOrder) == 0:
        print("Error could not load any hours")
        import sys
        sys.exit()
    return hoursInOrder, datesInOrder
def mainProcessScreenNames(api, db_name, collectionName, screenNames, portInput, writeDescription=False):
    """Fetch and store Twitter profile info for not-yet-written screen names.

    :param api: authenticated Twitter API handle
    :param db_name: MongoDB database name
    :param collectionName: target collection for the profile documents
    :param screenNames: screen names to query
    :param portInput: MongoDB port, or None for the default client
    :param writeDescription: forwarded flag controlling whether the profile
        description is stored
    """
    from datetime import datetime
    print("Starting Collecting User Info " + str(datetime.now()) +
          " into collection " + collectionName)
    from MongoDBInterface import getMongoClient
    # IDIOM FIX: `is not None` instead of `!= None`; single expression.
    client = getMongoClient(portInput) if portInput is not None else getMongoClient()
    collectionToWrite = client[db_name][collectionName]
    from MongoDBInterface import getUsersUnwrittenToMongoDB
    usersUnwritten = list(
        getUsersUnwrittenToMongoDB(collectionToWrite, screenNames,
                                   "screenName", {}))
    print(str(len(usersUnwritten)) + " users unwritten for part 1")
    getFollowerInfoFromListOfScreenNames(api, usersUnwritten,
                                         collectionToWrite, writeDescription)
    print("Finished Collecting User Info " + str(datetime.now()) +
          " into collection:" + collectionName)
def formCommunities(screenNames, outputDir, minFriend, maxFriend, maxFollower, portInput):
    """Build the final follower community shared by at least two influencers.

    Loads each influencer's follower-ID pickle, keeps IDs that follow at
    least two of the given influencers, then filters by follower/friend-count
    thresholds using each influencer DB's followerInfo collection.

    :returns: set of follower id_str values passing all thresholds
    """
    import pickle
    from itertools import combinations
    from MongoDBInterface import getMongoClient

    screenNameToFollowers = {}
    for screenName in screenNames:
        userInfoPath = outputDir + str(screenName.lower()) + ".pickle"
        with open(userInfoPath, "rb") as fp:
            followers = pickle.load(fp)
        followersSTR = set()
        for follower in followers:
            try:
                followersSTR.add(str(follower))
            except Exception:  # narrowed from a bare except
                print("Error loading follower")
        screenNameToFollowers[screenName] = followersSTR
    # IDIOM FIX: the original double index loop with `if i < j` is exactly
    # itertools.combinations over unordered pairs.
    followersOfInterest = set()
    for nameA, nameB in combinations(screenNames, 2):
        mutualFollowers = screenNameToFollowers[nameA].intersection(
            screenNameToFollowers[nameB])
        followersOfInterest = followersOfInterest.union(mutualFollowers)
    print(
        str(len(followersOfInterest)) +
        " followers of interest by iterating over every pair of influencers")
    client = getMongoClient(portInput)
    finalCommunity = set()
    for db_name in screenNames:
        collection = client[db_name]["followerInfo"]
        tweetCursor = collection.find({}, no_cursor_timeout=True)
        for userInfo in tweetCursor:
            uid = str(userInfo['id_str'])
            if uid in followersOfInterest:
                if (userInfo['followers_count'] <= maxFollower
                        and minFriend <= userInfo['friends_count'] <= maxFriend):
                    finalCommunity.add(uid)
    print(
        str(len(finalCommunity)) +
        " final community after applying thresholds")
    return finalCommunity
def mainProcessFriends(api, db_name, collectionName, ids, portInput):
    """Collect friend-ID lists for every user id not yet present in Mongo.

    For each id in `ids` that has no document in db_name.collectionName,
    fetches its friends via getFriendsIDs and writes the result there.
    """
    from datetime import datetime
    from MongoDBInterface import getMongoClient, getUsersUnwrittenToMongoDB

    print("Starting Collecting Friends")
    collectionToWrite = getMongoClient(portInput)[db_name][collectionName]
    usersUnwritten = getUsersUnwrittenToMongoDB(collectionToWrite, ids,
                                                "_id", {})
    print("Users unwritten " + str(len(usersUnwritten)))
    for userID in usersUnwritten:
        getFriendsIDs(api, userID, collectionToWrite)
    print("Finished Collecting Friends " + str(datetime.now()))
def writeTopNVectors(communityToCommunityFollows, dictionary, weightedVectors, N, portInput, outputDir):
    """For each community label, write a CSV describing its top-N weighted friends.

    weightedVectors maps label -> list of (dictionary-index, weight) pairs;
    `dictionary` maps index -> friend id_str. Friend metadata is pulled from
    the per-label database's friendInfo collection.
    """
    from MongoDBInterface import getMongoClient
    from Main import writeRowsToCSV

    fieldsOfInterest = [
        'screenName', 'followers_count', 'location', 'name', 'created_at',
        'description'
    ]
    for label in weightedVectors:
        # Convert the (index, weight) tuples into a plain index -> weight map.
        indexToWeight = {pair[0]: pair[1] for pair in weightedVectors[label]}
        topNIndexes, topNToCount = getTopNFromDict(indexToWeight, N)
        topNNames = [dictionary[idx] for idx in topNIndexes]
        print(label)
        print(topNNames)
        collectionToRead = getMongoClient(portInput)[label]["friendInfo"]
        cursor = collectionToRead.find({}, no_cursor_timeout=True)
        nameToRow = {}
        for userInfo in cursor:
            uid = str(userInfo["id_str"])
            if uid in topNNames:
                row = [communityToCommunityFollows[label][uid]]
                for field in fieldsOfInterest:
                    row.append(userInfo[field])
                nameToRow[uid] = row
        rows = [['FollowsByCommunity'] + fieldsOfInterest]
        for name in topNNames:
            rows.append(nameToRow[name])
        writeRowsToCSV(rows, outputDir + label + "TopNusingTFIDF.csv")
def generateFrequencyOfTokens(db_name, collectionName, outputDir, port):
    """Build (or load a cached) token -> frequency map for a tweet collection.

    Tokenizes every tweetOriginal with processString and counts token
    occurrences; the result is cached as a pickle in outputDir so repeat
    calls skip the Mongo scan.

    :returns: dict mapping token -> occurrence count
    """
    import os
    import pickle
    from MongoDBInterface import getMongoClient

    if not os.path.isdir(outputDir):
        os.mkdir(outputDir)
    fileToStoreTo = outputDir + "tokenToFrequency_" + str(db_name) + "_" + str(
        collectionName) + ".pickle"
    if not os.path.isfile(fileToStoreTo):
        # PERF FIX: only connect to Mongo on a cache miss; the original opened
        # the client unconditionally, even when loading from the pickle.
        client = getMongoClient(port)
        collectionToRead = client[db_name][collectionName]
        tweetCursor = collectionToRead.find({}, {'tweetOriginal': 1},
                                            no_cursor_timeout=True)
        tokenToFrequency = {}
        for userInfo in tweetCursor:
            for token in processString(userInfo["tweetOriginal"]):
                # IDIOM: dict.get replaces the membership-test-then-init dance.
                tokenToFrequency[token] = tokenToFrequency.get(token, 0) + 1
        with open(fileToStoreTo, "wb") as fp:
            pickle.dump(tokenToFrequency, fp)
    else:
        with open(fileToStoreTo, "rb") as fp:
            tokenToFrequency = pickle.load(fp)
    print("loaded " + str(len(tokenToFrequency)) + " tokens.")
    return tokenToFrequency
def getTopNCommunityFollowsInCSV(db_name, collectionName, portInput, usersCommunityFollows, N):
    """Write a CSV of the N accounts most-followed by the community.

    usersCommunityFollows maps friend id_str -> number of community members
    following it; the top N by that count are described using profile fields
    from db_name.collectionName.

    NOTE(review): `outputDir` is read from module scope here (it is not a
    parameter) — confirm the module defines it before this is called.
    """
    from operator import itemgetter
    from MongoDBInterface import getMongoClient
    # BUG FIX: writeRowsToCSV was never imported in this function, so the
    # final call raised NameError; import it as sibling functions do.
    from Main import writeRowsToCSV

    res = sorted(usersCommunityFollows.items(),
                 key=itemgetter(1), reverse=True)[:N]
    topN = [pair[0] for pair in res]
    print(topN)
    print(res)
    collectionToRead = getMongoClient(portInput)[db_name][collectionName]
    tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)
    fieldsOfInterest = [
        'screenName', 'followers_count', 'location', 'name', 'created_at',
        'description'
    ]
    nameToRow = {}
    topNSet = set(topN)  # O(1) membership test per cursor document
    for userInfo in tweetCursor:
        uid = str(userInfo["id_str"])
        if uid in topNSet:
            row = [usersCommunityFollows[uid]]
            for field in fieldsOfInterest:
                row.append(userInfo[field])
            nameToRow[uid] = row
    rows = [['FollowsByCommunity'] + fieldsOfInterest]
    for name in topN:
        rows.append(nameToRow[name])
    writeRowsToCSV(rows,
                   outputDir + db_name + "TopNMostFrequentlyFollowed.csv")
def mainProcessIDs(api, db_name, collectionName, ids, collectionNameWritesPerformed, portInput, writeDescription=False):
    """Collect profile info for user IDs, resuming after already-written IDs.

    `ids` is assumed to be processed front-to-back on previous runs, so the
    un-written IDs form a suffix: we count the trailing run of IDs absent
    from the collection and only fetch those. Afterwards a marker document
    is inserted into collectionNameWritesPerformed.
    """
    from datetime import datetime
    from MongoDBInterface import getMongoClient

    print("Starting Collecting User Info " + str(datetime.now()))
    client = getMongoClient(portInput)
    db = client[db_name]
    collectionToWrite = db[collectionName]
    tweetCursor = collectionToWrite.find({}, no_cursor_timeout=True)
    userAlreadyWritten = set()
    for userInfo in tweetCursor:
        userAlreadyWritten.add(str(userInfo["id_str"]))
    if len(userAlreadyWritten) > 0:
        print(str(len(userAlreadyWritten)) + " userAlreadyWritten")
        # Length of the trailing run of not-yet-written ids.
        count = 0
        for uid in ids[::-1]:  # reversed list
            if uid in userAlreadyWritten:
                break
            count += 1
        # BUG FIX: the original did `ids = ids[-count:]` unconditionally;
        # when count == 0 (the last id was already written) ids[-0:] is the
        # WHOLE list, so every user would have been re-fetched.
        ids = ids[-count:] if count > 0 else []
        print(len(ids))
    getFollowerInfoFromListOfIDs(api, ids, collectionToWrite,
                                 writeDescription)
    # Record that this db_name has been fully processed.
    infoToWrite = [{"_id": db_name}]
    try:
        db[collectionNameWritesPerformed].insert_many(infoToWrite,
                                                      ordered=False)
    except Exception:  # narrowed from a bare except (duplicate _id expected)
        print("Error when doing bulk write")
    print("Finished Collecting User Info " + str(datetime.now()))
def setupDBUsingSingleUser(twitterAPI1, screenName, maxFollowersToCollectInput, followersDir, portInput, reprocess=False, writeFollowerOnly=False):
    """Build the per-influencer MongoDB database for one screen name.

    Step 1 collects the influencer's own profile, step 2 samples its
    followers (pickled to disk), step 3 collects profile metadata for each
    sampled follower. Skips everything if the DB already contains the
    completion marker collection.
    """
    import os
    import pickle
    from MongoDBInterface import getMongoClient

    outputDirFollower = 'CollectFollowers/'
    if followersDir is not None:
        outputDirFollower = followersDir
    if not os.path.isdir(outputDirFollower):
        os.mkdir(outputDirFollower)
    client = getMongoClient(portInput)
    # Guard clause: already fully collected on a previous run.
    if screenName in client.list_database_names():
        db = client[screenName]
        if "influencerOverWhichFollowerInfoCollected" in db.list_collection_names():
            return
    print("working on " + screenName)
    db_name = screenName
    screenNamesToQuery = [screenName]
    collectionName1 = "influencerInfo"
    collectionName2 = "influencerFollowerSample"
    collectionName3 = "followerInfo"
    collectionName3b = "influencerOverWhichFollowerInfoCollected"
    # step1: collect influencer info
    from CollectUserInfo import mainProcessScreenNames
    mainProcessScreenNames(twitterAPI1, db_name, collectionName1,
                           screenNamesToQuery, portInput)
    # step2: collect a sample of followers for each influencer
    collectionToRead = client[db_name][collectionName1]
    from MongoDBInterface import getUsersWrittenToMongoDB
    influencerScreenNames = list(
        getUsersWrittenToMongoDB(collectionToRead, "screenName"))
    if len(influencerScreenNames) == 0:
        return
    # Twitter may report different capitalization than what we queried with;
    # map our query names onto Twitter's canonical form.
    influencerScreenNamesLowerCaseToProper = {}
    for properName in influencerScreenNames:
        influencerScreenNamesLowerCaseToProper[properName.lower()] = properName
    screenNamesToQueryProper = []
    # BUG FIX: the original reused the parameter name `screenName` as this
    # loop variable, clobbering it for the rest of the function (harmless
    # only because screenNamesToQuery held that single name).
    for queriedName in screenNamesToQuery:
        if queriedName.lower() in influencerScreenNamesLowerCaseToProper:
            screenNamesToQueryProper.append(
                influencerScreenNamesLowerCaseToProper[queriedName.lower()])
    print("working with ", len(screenNamesToQueryProper), " out of ",
          len(screenNamesToQuery), " original screenNames")
    picklePath = outputDirFollower + str(screenName.lower()) + ".pickle"
    from CollectFollowers import mainProcessFollowers
    if (not os.path.isfile(picklePath)) or reprocess:
        mainProcessFollowers(
            twitterAPI1, db_name, collectionName2, screenNamesToQueryProper,
            influencerScreenNamesLowerCaseToProper, portInput,
            maxFollowersToCollect=maxFollowersToCollectInput,
            filePathToStoreToInput=picklePath)
    if writeFollowerOnly:
        return
    # step3: collect profile metadata on each follower
    from CollectUserInfo import mainProcessIDs
    with open(picklePath, "rb") as fp:
        followers = pickle.load(fp)
    followersSTR = []
    for follower in followers:
        try:
            followersSTR.append(str(follower))
        except Exception:  # narrowed from a bare except
            print("Error loading follower")
    followers = followersSTR
    print(str(len(followers)) + " followers loaded")
    mainProcessIDs(twitterAPI1, db_name, collectionName3, followers,
                   collectionName3b, portInput)
def processTimeDistributions(minFreq, port, tokenToFrequencyGlobal, outputDir, atUser=False):
    """Derive per-token time-of-day features and store them in Temp_Analysis.

    Reads the raw per-token hour histograms (TimeDist_Combined[_AtUser]),
    normalizes each to a 24-bin distribution, computes min/max bins, std,
    and sleep-time regression features via processTimeDist, and bulk-writes
    the results into TokenTimeFeaturesProcessed[User].

    NOTE(review): `outputDir` is accepted but never used in this body —
    kept for interface compatibility.
    """
    import numpy as np
    from MongoDBInterface import getMongoClient

    tokenOfInterest = set()
    for token in tokenToFrequencyGlobal:
        if tokenToFrequencyGlobal[token] >= minFreq:
            tokenOfInterest.add(token)
    db_name = "Temp_Analysis"
    collectionName = "TimeDist_Combined"
    if atUser:
        collectionName = "TimeDist_Combined_AtUser"
    client = getMongoClient(port)
    collectionToRead = client[db_name][collectionName]
    tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)
    collectionNameToWriteTo = "TokenTimeFeaturesProcessed"
    if atUser:
        collectionNameToWriteTo = "TokenTimeFeaturesProcessedUser"
    db = client["Temp_Analysis"]
    collectionName = collectionNameToWriteTo
    db[collectionName].drop()  # rebuild the output collection from scratch
    timeOfDay = [str(h) for h in range(24)]
    header = timeOfDay + [
        'indexOfMin', "indexOfMax", "stdOfTimeDist", 'rSquare', "power",
        "predUTC", 'totalRecords', 'id'
    ]
    infoToWrite = []
    for userInfo in tweetCursor:
        timeDist = userInfo
        idToken = timeDist.pop('_id')
        # Only tokens frequent enough, with activity in every hour bucket.
        if idToken in tokenOfInterest and np.min(list(timeDist.values())) > 0:
            total = np.sum(list(timeDist.values()))
            values = []
            for t in timeOfDay:
                if t in timeDist:
                    values.append(float(timeDist[t]) / float(total))
                else:
                    values.append(0)
            stdOfTimeDist = np.std(values)
            resultsTemp = processTimeDist(values, 5, 33)
            indexOfMax = values.index(np.max(values))
            indexOfMin = values.index(np.min(values))
            PSTModified = None
            rSquare = None
            power = None
            if resultsTemp['twoPointTestPass'] and resultsTemp['parabola']:
                value = resultsTemp['predictedSleepTime']
                # Wrap late-night predictions and shift to the predUTC scale.
                if value >= 14:
                    value = value - 24
                PSTModified = -value + 4
                rSquare = resultsTemp['rSquare']
                power = resultsTemp["power"]
            row = values + [
                indexOfMin, indexOfMax, stdOfTimeDist, rSquare, power,
                PSTModified, total, idToken
            ]
            d = dict(zip(header, row))
            d["_id"] = idToken
            infoToWrite.append(d)
            if len(infoToWrite) > 1000:
                try:
                    db[collectionName].insert_many(infoToWrite, ordered=False)
                    infoToWrite = []
                    print("Performed write")
                except Exception:
                    print("Error when doing bulk write")
    # BUG FIX: the original never flushed the final partial batch, silently
    # dropping up to 1000 documents; sibling functions do perform this flush.
    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            print("Performed write")
        except Exception:
            print("Error when doing bulk write")
def setupTempTableWithTimeDistributionforEachToken(db_nameAndCollection, minFreq, port, tokenToFrequencyGlobal, atUser=False):
    """Aggregate an hour-of-day histogram per token and store it in Mongo.

    For every (db, collection) pair, tokenizes tweetOriginal and increments
    the token's bucket for the tweet's hour field ('tHour', or 'tHour2' with
    deduplication per username when atUser=True). Histograms go to
    Temp_Analysis.TimeDist_Combined[_AtUser], one document per token.
    """
    import copy
    from MongoDBInterface import getMongoClient

    print("creating time distribution for each token")
    field = 'tHour2' if atUser else 'tHour'
    collectionNameToWriteTo = ("TimeDist_Combined_AtUser"
                               if atUser else "TimeDist_Combined")
    tokenOfInterest = set()
    for token in tokenToFrequencyGlobal:
        if tokenToFrequencyGlobal[token] >= minFreq:
            tokenOfInterest.add(token)
    print(str(len(tokenOfInterest)) + " tokenOfInterest")
    tokenToTimeDist = {}
    blankTimeDist = {str(hour): 0 for hour in range(24)}
    usersProcessed = set()
    # PERF FIX: one client for all collection pairs; the original reconnected
    # inside the loop on every iteration.
    client = getMongoClient(port)

    def _countTokens(userInfo):
        # Shared per-tweet counting for both the per-user and per-tweet paths.
        for token in processString(userInfo["tweetOriginal"]):
            if token in tokenOfInterest:
                if token not in tokenToTimeDist:
                    tokenToTimeDist[token] = copy.copy(blankTimeDist)
                tokenToTimeDist[token][str(userInfo[field])] += 1

    for db_name, collectionName in db_nameAndCollection:
        collectionToRead = client[db_name][collectionName]
        fields = {'tweetOriginal': 1, field: 1}
        if atUser:
            fields['username'] = 1
        tweetCursor = collectionToRead.find({}, fields,
                                            no_cursor_timeout=True)
        for userInfo in tweetCursor:
            if atUser:
                # Count only the first tweet seen per username.
                if userInfo["username"] in usersProcessed:
                    continue
                usersProcessed.add(userInfo["username"])
            _countTokens(userInfo)
    db = client["Temp_Analysis"]
    collectionName = collectionNameToWriteTo
    db[collectionName].drop()
    infoToWrite = []
    for token in tokenToTimeDist:
        d = tokenToTimeDist[token]
        d["_id"] = token
        infoToWrite.append(d)
        if len(infoToWrite) > 1000:
            try:
                db[collectionName].insert_many(infoToWrite, ordered=False)
                infoToWrite = []
                print("Performed write")
            except Exception:  # narrowed from a bare except
                print("Error when doing bulk write")
    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            infoToWrite = []
            print("Performed write")
        except Exception:
            print("Error when doing bulk write")
    print("finished creating time distribution for each token")
# --- script setup: output dirs, Twitter API handle, Mongo port ---
# NOTE(review): `outputDir` and `os` must already be bound earlier in the
# module — the `import os` below comes AFTER the first os.path use; verify
# an earlier import exists.
if not os.path.isdir(outputDir):
    os.mkdir(outputDir)
followersDir = "collectFollowers/"
import os
if not os.path.isdir(followersDir):
    os.mkdir(followersDir)
from TwitterAPI import getAPI
twitterAPI1 = getAPI()
port = 27020  # MongoDB port used throughout the pipeline
step0 = False  # manual switch: seed influencer candidates via Google search
if step0:
    print("applying google search")
    db_name = "TempInfluencersFromGoogleSearch"
    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    from CollectUserInfo import mainProcessScreenNames
    db = client[db_name]
    # City-level search queries; results keyed per query below.
    queries1 = [
        "Minsk Belarus Twitter", "Moscow Russia Twitter",
        "Moskva Russia Twitter"
    ]
    queries2 = ["Buffalo NY Twitter", "Syracuse NY Twitter"]
    queries = queries1 + queries2
    import time
    for query in queries:
        potentialInfluencerToWebHit = googleSearch(query)
        print(potentialInfluencerToWebHit)
        # Collection named after the query with spaces stripped.
        collectionName = query.replace(" ", "")
def tokenToCoordinates3Div(db_nameAndCollection, portInput):
    """Label each tweet's tokens with a longitude zone; store per-token lists.

    Zone encoding by longitude: -1 (lon <= -25, Americas), 0 (-25 < lon <= 65,
    Africa/Europe), +1 (lon > 65, Asia/Australia). A tweet with an exact
    coordinatesPoint is labeled directly; a tweet with only a place
    bounding box is labeled only when all four corners fall in one zone.
    Results are bulk-written to Temp_Analysis.TokenToCoordinates_Combined3Div
    as {_id: token, coordinates: [label, ...]}.
    """
    from Step2ProcessTableOfTweets import processString
    from MongoDBInterface import getMongoClient

    errorCounts = 0
    tokenToCoordinate = {}
    # PERF FIX: one client for all pairs; the original reconnected per pair
    # (and raised NameError after the loop if the input list was empty).
    client = getMongoClient(portInput)
    for db_name, collectionName in db_nameAndCollection:
        collectionToRead = client[db_name][collectionName]
        fields = {'tweetOriginal': 1, 'coordinatesPoint': 1, 'place': 1}
        tweetCursor = collectionToRead.find({}, fields,
                                            no_cursor_timeout=True)
        for userInfo in tweetCursor:
            label = None
            if 'coordinatesPoint' in userInfo:
                lon = userInfo['coordinatesPoint']['coordinates'][0]
                if lon <= -25:
                    label = -1
                elif lon <= 65:
                    label = 0
                else:
                    label = 1
            elif 'place' in userInfo:
                longs = []
                for coordinatePair in userInfo['place']['bounding_box'][
                        'coordinates'][0]:
                    longs.append(coordinatePair[0])
                l1 = sum(1 for lon in longs if lon <= -25)
                l2 = sum(1 for lon in longs if -25 < lon <= 65)
                l3 = sum(1 for lon in longs if lon > 65)
                if l1 == 4:
                    label = -1
                elif l2 == 4:
                    label = 0
                elif l3 == 4:
                    label = 1
                else:
                    # Bounding box straddles a zone boundary: skip the tweet.
                    print(str(errorCounts) + " " + str(userInfo['place']))
                    errorCounts += 1
            if label is not None:
                for token in processString(userInfo["tweetOriginal"]):
                    tokenToCoordinate.setdefault(token, []).append(label)
    db = client["Temp_Analysis"]
    collectionName = "TokenToCoordinates_Combined3Div"
    db[collectionName].drop()
    infoToWrite = []
    for token in tokenToCoordinate:
        infoToWrite.append({"_id": token,
                            "coordinates": tokenToCoordinate[token]})
        if len(infoToWrite) > 1000:
            try:
                db[collectionName].insert_many(infoToWrite, ordered=False)
                infoToWrite = []
                print("Performed write")
            except Exception:  # narrowed from a bare except
                print("Error when doing bulk write")
    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            infoToWrite = []
            print("Performed write")
        except Exception:
            print("Error when doing bulk write")