def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
    '''
    Observe the new dimensions that get added to the current dimension set when
    dimensions are updated at regular intervals, for example, the number of
    dimensions added after 10m, 20m, ... 5 hours. As time increases, the number
    of "decayed" dimensions increases, so the current dimension set accumulates
    a lot of unwanted decayed dimensions. Using this information, identify the
    time interval that is best suited to refresh dimensions.
    Tentative: we pick the time interval at which the rate of decay is maximum.
    '''
    def updatePhraseScore(phraseObject):
        phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phraseObject
    dimensions = estimationObject.stream_settings['dimensions']
    newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
    print currentMessageTime, len(newList)
    if len(newList) >= dimensions:
        idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i))
                                        for i in estimationObject.dimensionUpdateTimeDeltas
                                        if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
        dimensionsUpdateFrequency = {}
        for td, id in idsOfDimensionsListToCompare:
            oldList = estimationObject.dimensionListsMap[id]
            dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
        print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
            'settings': pprint.pformat(estimationObject.stream_settings),
            ParameterEstimation.dimensionsUpdateFrequencyId: dimensionsUpdateFrequency
        }
        FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
        estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
        # Drop snapshots older than the largest interval we ever compare against.
        for key in estimationObject.dimensionListsMap.keys()[:]:
            if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]:
                del estimationObject.dimensionListsMap[key]
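# The iterationData records written above store, for each candidate interval
# (in seconds), how many of the current top dimensions were missing from the
# snapshot taken that long ago. A hedged sketch of the downstream step implied
# by the docstring (pick the interval where the rate of decay peaks);
# illustrative only, not part of the estimation code itself:
def pickRefreshIntervalInSeconds(dimensionsUpdateFrequency):
    intervals = sorted(int(seconds) for seconds in dimensionsUpdateFrequency)
    bestInterval, bestRate = None, float('-inf')
    for previous, current in zip(intervals, intervals[1:]):
        # Rate of decay: extra dimensions dropped per second between snapshots.
        rate = (dimensionsUpdateFrequency[str(current)]
                - dimensionsUpdateFrequency[str(previous)])/float(current-previous)
        if rate > bestRate:
            bestInterval, bestRate = current, rate
    return bestInterval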
def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
    for model_id in models_ids:
        output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')))):
            print line_count, model_id
            tuo_neighbor_location_and_pure_influence_score = []
            location_hashtag_set = set(location_object['hashtags'])
            for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                pure_influence_scores = []
                for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                    if hashtag in location_object['hashtags']:
                        location_occurrences = location_object['hashtags'][hashtag][0]
                        pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                if hashtag_tag==w_extra_hashtags_tag:
                    # Hashtags seen in only one of the two locations contribute the
                    # extreme scores: +1 if only here, -1 if only at the neighbor.
                    for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set):
                        pure_influence_scores.append(1.0)
                    for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set):
                        pure_influence_scores.append(-1.0)
                mean_pure_influence_score = np.mean(pure_influence_scores)
                tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
            tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
    def location_similarity(location_vector_1, location_vector_2):
        # Unnormalized dot product over the union of keys.
        return reduce(lambda total, k: total+(location_vector_1.get(k, 0)*location_vector_2.get(k, 0)),
                      set(location_vector_1.keys()).union(location_vector_2.keys()), 0.)
    influence_types = [InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE,
                       InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE,
                       InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
    for model_id in model_ids:
        mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
        GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')))):
            print line_count
            location = location_object['id']
            tuo_neighbor_location_and_mf_influence_type_and_similarity = []
            for neighbor_location in location_object['links'].keys():
                mf_influence_type_and_similarity = {}
                for influence_type in influence_types:
                    similarity = location_similarity(
                                    mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                    mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type])
                    mf_influence_type_and_similarity[influence_type] = similarity
                so_hashtags_for_location = set(location_object['hashtags'].keys())
                so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator
                tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
            FileIO.writeToFileAsJson(
                [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
    for test_model_id in test_models_ids:
        output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
        GeneralMethods.runCommand('rm -rf %s'%output_file)
        ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
        for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
            ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                  for location, ito_location_and_occurrence_time in
                                                  groupby(sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)), key=itemgetter(0))]
            print hashtag_count, test_model_id
            ltuo_location_and_pure_influence_score = []
            for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                pure_influence_scores = []
                for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                    if location!=neighbor_location:
                        pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                        pure_influence_scores.append(pure_influence_score)
                ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
            ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
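# The two generators above share a contract: MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID
# (defined elsewhere) maps a model id to a function that scores how much one
# location influences another. A hedged, illustrative sketch of one such model,
# assuming both arguments are lists of occurrence timestamps (as in
# generate_hashtag_specific_location_and_pure_influence_scores) and that scores
# lie in [-1, 1] to match the +/-1.0 defaults used for unshared hashtags:
def first_occurrence_influence(occurrence_times_1, occurrence_times_2):
    # Positive when the first location saw the hashtag earlier, negative when
    # the second did, zero on a tie. Illustrative only; not the project's models.
    first_1, first_2 = min(occurrence_times_1), min(occurrence_times_2)
    if first_1 < first_2: return 1.0
    elif first_1 > first_2: return -1.0
    return 0.0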
def combineLocationGraphs(graphMap, startingGraphId, startingTime, intervalInSeconds, linear=True, **kwargs):
    # Number of unit graphs that span the requested interval, rounding up.
    if intervalInSeconds%TIME_UNIT_IN_SECONDS==0 and int(intervalInSeconds/TIME_UNIT_IN_SECONDS)!=0:
        numberOfGraphs = int(intervalInSeconds/TIME_UNIT_IN_SECONDS)
    else:
        numberOfGraphs = int(intervalInSeconds/TIME_UNIT_IN_SECONDS)+1
    graphId = GeneralMethods.approximateEpoch(GeneralMethods.getEpochFromDateTimeObject(startingTime), TIME_UNIT_IN_SECONDS)
    currentLogarithmicId = LocationGraphs.getLogarithmicGraphId(startingGraphId, graphId)
    currentCollectedGraphs = 0
    graphIdsToCombine = []
    while currentCollectedGraphs!=numberOfGraphs and currentLogarithmicId>0:
        # Largest power-of-two block of graphs still needed.
        numberOfGraphsToCollect = 2**int(math.log(numberOfGraphs-currentCollectedGraphs, 2))
        if not linear and currentLogarithmicId%2==0:
            indices = [1]+map(lambda j: 2**j,
                              filter(lambda j: currentLogarithmicId%(2**j)==0,
                                     range(1, int(math.log(currentLogarithmicId+1, 2))+1)))
            if max(indices)>numberOfGraphsToCollect and numberOfGraphsToCollect in indices:
                index = numberOfGraphsToCollect
            else:
                index = max(indices)
        else:
            index = 1
        logGraphId = '%s_%s'%(LocationGraphs.getGraphId(startingGraphId, currentLogarithmicId), index)
        if logGraphId in graphMap:
            graphIdsToCombine.append(logGraphId)
        currentLogarithmicId -= index
        currentCollectedGraphs += index
    graphIdsToCombine = sorted(graphIdsToCombine, key=lambda id: int(id.split('_')[1]), reverse=True)
    graphsToCombine = [graphMap[id] for id in graphIdsToCombine]
    return combineGraphList(graphsToCombine, **kwargs)
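# A toy illustration of the decomposition idea above (not the actual
# LocationGraphs bookkeeping): an interval of n unit graphs is covered greedily
# by the largest power-of-two blocks, so a non-linear combine touches
# O(log n) pre-combined graphs instead of n individual ones.
import math

def power_of_two_blocks(n):
    blocks = []
    while n > 0:
        block = 2**int(math.log(n, 2))  # largest power of two <= n
        blocks.append(block)
        n -= block
    return blocks
# power_of_two_blocks(13) -> [8, 4, 1]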
def analyzeQuality(graphs, graphType):
    def getQualityScore(graphMap, edgesToKeep, timeDifference):
        dataToReturn = []
        for j, intervalInSeconds in enumerate([1]):
            intervalInSeconds *= timeDifference
            linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
            logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
            linearClusters = [[str(c), [l[0] for l in lst]]
                              for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            logarithmicClusters = [[str(c), [l[0] for l in lst]]
                                   for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
            print intervalInSeconds, edgesToKeep, score
            dataToReturn.append(score)
        return dataToReturn
    graphFile = qualityMetricsFolder%graphType
    print graphFile
    GeneralMethods.runCommand('rm -rf %s'%graphFile)
    for edgesToKeep in range(1, 11):
        edgesToKeep *= 0.1
        graphMap = dict(graphs[:])
        startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
        timeDifference = endingGraphId-startingGraphId
        LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
        FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep,
                                  'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))},
                                 graphFile)
def trendCurves():
    model = MixedUsersModel()
    experimentFileName = spamModelFolder+model.id
    conf = {'model': model,
            'addUsersMethod': User.addUsersUsingRatio,
            'analysisMethods': [(Analysis.trendCurves, 1)],
            'ratio': {'normal': 0.985, 'spammer': 0.015},
            'experimentFileName': experimentFileName}
    GeneralMethods.runCommand('rm -rf %s'%experimentFileName)
    run(**conf)
    Analysis.trendCurves(experimentFileName=experimentFileName)
def mapper(self, key, hashtag_object):
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    if ltuo_occ_time_and_occ_location:
        # Bucket occurrence times into fixed-size intervals.
        ltuo_intvl_time_and_occ_location = [(GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS), occ_location)
                                            for occ_time, occ_location in ltuo_occ_time_and_occ_location]
        ltuo_intvl_time_and_items =\
            GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
        ltuo_intvl_time_and_items.sort(key=itemgetter(0))
        first_time = ltuo_intvl_time_and_items[0][0]
        # Interval id (iid) is the offset from the first interval, in time units.
        intvl_method = lambda (t, it): ((t-first_time)/TIME_UNIT_IN_SECONDS, (t, len(it)))
        ltuo_iid_and_tuo_interval_and_occurrence_count = map(intvl_method, ltuo_intvl_time_and_items)
        peak_tuo_iid_and_tuo_interval_and_occurrence_count =\
            max(ltuo_iid_and_tuo_interval_and_occurrence_count,
                key=lambda (_, (__, occurrence_count)): occurrence_count)
        peak_iid = peak_tuo_iid_and_tuo_interval_and_occurrence_count[0]
        current_val = 0.0
        total_occurrences = sum(data[1][1] for data in ltuo_iid_and_tuo_interval_and_occurrence_count)
        for iid, (_, occurrence_count) in ltuo_iid_and_tuo_interval_and_occurrence_count:
            is_peak = 0.0
            if iid==peak_iid: is_peak = 1.0
            current_val += occurrence_count
            # Emit, per interval id: peak indicator, occurrence share, and CDF so far.
            yield iid, [is_peak, occurrence_count/total_occurrences, current_val/total_occurrences]
def mapper(self, key, hashtag_object):
    hashtag = hashtag_object['hashtag']
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    if ltuo_occ_time_and_occ_location:
        ltuo_intvl_time_and_occ_location = [(GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS), occ_location)
                                            for occ_time, occ_location in ltuo_occ_time_and_occ_location]
        points = [UTMConverter.getLatLongUTMIdInLatLongForm(loc) for _, loc in ltuo_occ_time_and_occ_location]
        ltuo_intvl_time_and_items =\
            GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
        ltuo_intvl_time_and_items.sort(key=itemgetter(0))
        first_time = ltuo_intvl_time_and_items[0][0]
        ltuo_iid_and_occ_count = map(lambda (t, it): ((t-first_time)/TIME_UNIT_IN_SECONDS, len(it)),
                                     ltuo_intvl_time_and_items)
        ltuo_location_and_items =\
            GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(1))
        mf_location_to_occ_count = dict(map(lambda (l, it): (l, len(it)), ltuo_location_and_items))
        spatial_metrics = {
            'hashtag': hashtag,
            'num_of_occurrenes': len(ltuo_occ_time_and_occ_location),
            'peak_iid': max(ltuo_iid_and_occ_count, key=itemgetter(1))[0],
            'focus': focus(mf_location_to_occ_count),
            'entropy': entropy(mf_location_to_occ_count, as_bits=False),
            'spread': getRadiusOfGyration(points)
        }
        yield hashtag, spatial_metrics
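# focus(), entropy() and getRadiusOfGyration() are imported from elsewhere in
# the codebase. Hedged sketches of the first two, inferred from how their
# results are used (ef_plot below unpacks focus as a (location, fraction) pair,
# and as_bits=False suggests natural-log entropy):
import math

def focus(mf_location_to_occ_count):
    # The location with the most occurrences, and its share of all occurrences.
    total = float(sum(mf_location_to_occ_count.values()))
    location, count = max(mf_location_to_occ_count.items(), key=lambda (l, c): c)
    return (location, count/total)

def entropy(mf_location_to_occ_count, as_bits=True):
    # Shannon entropy of the occurrence distribution over locations.
    total = float(sum(mf_location_to_occ_count.values()))
    probabilities = [count/total for count in mf_location_to_occ_count.values()]
    log = (lambda p: math.log(p, 2)) if as_bits else math.log
    return -sum(p*log(p) for p in probabilities if p > 0)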
def writeUserClustersFile(place):
    print 'Generating clusters...'
    userVectors = GenerateDataFiles.getUserVectors(place)
    GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
    clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
    # clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
    for userId, userVector in userVectors.iteritems():
        userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
    for data in userVectors.iteritems():
        FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
def performanceWithSpamFilteringForLatestMessages(generateData):
    experimentData = defaultdict(dict)
    # Fine-grained sweep of spammer percentages: 0.1% to 5% in 0.1% steps.
    spammerPercentages = [i*0.001 for i in range(1, 51)]
    for iteration in range(10):
        for spammerPercentage in spammerPercentages:
            experimentFileName = spamModelFolder+'performanceWithSpamFilteringForLatestMessages/%s/%0.3f'%(iteration, spammerPercentage)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model,
                        'numberOfTimeSteps': 10,
                        'addUsersMethod': User.addUsersUsingRatio,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered],
                        'experimentFileName': experimentFileName,
                        'noOfTopics': 10}
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName)
                run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id] += data['spammmess'][ranking_id]
                experimentData[iteration][spammerPercentage] = tempData
    if not generateData:
        # Average the spamness of each ranking method over all iterations.
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems():
                    dataY[ranking_id].append(np.mean(values))
            dataX = sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]):
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x] = []
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY:
            plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX],
                     label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Percentage of Spammers', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
        plt.legend(loc=2)
        plt.xlim(xmax=0.05)
        plt.savefig('performanceWithSpamFilteringForLatestMessages.png')
        plt.clf()
def messageSelectionMethod(self, currentTimeStep, user, currentTopics, **conf):
    message = None
    if GeneralMethods.trueWith(user.messagingProbability):
        if GeneralMethods.trueWith(user.newTopicProbability):
            topic = Topic(len(currentTopics))
            currentTopics.append(topic)
            message = user.generateMessage(currentTimeStep, topic)
        else:
            message = user.generateMessage(currentTimeStep, random.choice(currentTopics))
    return message
def performanceWithSpamDetection(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0, 0.4, 0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        for spamDetectionRatio, spammerPercentage in zip(ratios, spammerPercentages):
            experimentFileName = spamModelFolder+'performanceWithSpamDetection/%s/%0.3f'%(iteration, spamDetectionRatio)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model,
                        'numberOfTimeSteps': 100,
                        'addUsersMethod': User.addUsersUsingRatioWithSpamDetection,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered,
                                           RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                        'spamDetectionRatio': spamDetectionRatio,
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName)
                run(**conf)
            else:
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        if data['currentTimeStep'] not in experimentData[spamDetectionRatio]:
                            experimentData[spamDetectionRatio][data['currentTimeStep']] = defaultdict(list)
                        experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id] += data['spammmess'][ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
            for timeUnit in experimentData[spamDetectionRatio]:
                dataToPlot['x'].append(timeUnit)
                for ranking_id in experimentData[spamDetectionRatio][timeUnit]:
                    dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][timeUnit][ranking_id]))
            sdr[spamDetectionRatio] = dataToPlot
        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
                dataY = smooth(sdr[spamDetectionRatio][ranking_id], 8)[:len(sdr[spamDetectionRatio]['x'])]
                dataX, dataY = sdr[spamDetectionRatio]['x'][10:], dataY[10:]
                print 'x', [x-10 for x in dataX]
                print ranking_id, dataY
                if spamDetectionRatio==0.0:
                    plt.plot([x-10 for x in dataX], dataY, label='%s'%(labels[ranking_id]),
                             lw=1, marker=marker[spamDetectionRatio])
                else:
                    plt.plot([x-10 for x in dataX], dataY,
                             label='%s (%d'%(labels[ranking_id].replace('Filtering', 'Detection'), spamDetectionRatio*100)+'%)',
                             lw=1, marker=marker[spamDetectionRatio])
            plt.ylim(ymin=0, ymax=1)
            plt.xlim(xmin=0, xmax=75)
            plt.legend()
            plt.xlabel('Time', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
            savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            plt.clf()
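# smooth() above is defined elsewhere; the call sites only require that it
# smooths a series and returns at least as many points as it was given. A
# hypothetical moving-average stand-in with those semantics:
import numpy as np

def smooth(values, window_len):
    # Same-length moving average; a simple substitute for the real smoother.
    window = np.ones(window_len)/float(window_len)
    return np.convolve(values, window, mode='same')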
def __init__(self, id):
    self.id = id
    self.totalCount = 0
    self.countDistribution = defaultdict(int)
    self.age = 0
    self.topicClass = random.choice(topicClasses)
    self.decayCoefficient = -3
    # A small fraction (5%) of topics are "sticky" and retain user attention.
    if GeneralMethods.trueWith(0.05):
        self.stickiness = random.uniform(stickinessLowerThreshold, 1.0)
    else:
        self.stickiness = random.uniform(0.0, 0.1)
    self.payloads = PayLoad.generatePayloads(self.id, noOfPayloadsPerTopic)
    # Non-modeling attributes.
    self.color = GeneralMethods.getRandomColor()
def reducer(self, location, it_performance_values):
    performance_values = list(chain(*it_performance_values))
    performance_summary = defaultdict(list)
    for prediction_method, pvs_for_prediction_method in\
            GeneralMethods.group_items_by(performance_values, key=itemgetter('prediction_method')):
        for metric, pvs_for_prediction_method_and_metric in\
                GeneralMethods.group_items_by(pvs_for_prediction_method, key=itemgetter('metric')):
            performance_summary[metric].append([
                prediction_method,
                pvs_for_prediction_method_and_metric[0]['metric_value']
            ])
    yield '', dict(location=location, performance_summary=performance_summary)
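# GeneralMethods.group_items_by is used by several mappers and reducers here.
# Judging from its call sites (it returns a list of (key, items) pairs that
# callers sort or turn into a dict), a plausible sketch is sorted groupby with
# materialized groups:
from itertools import groupby

def group_items_by(items, key):
    # Sort by key, group, and materialize each group into a list.
    return [(k, list(group)) for k, group in groupby(sorted(items, key=key), key=key)]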
def writeARFFFile(place):
    userVectors = defaultdict(dict)
    locationToUserMap = dict((l['location'], l) for l in locationToUserMapIterator(place, minCheckins=50))
    for lid in locationToUserMap:
        for user in locationToUserMap[lid]['users']:
            userVectors[user][lid.replace(' ', '_')] = sum(
                len(locationToUserMap[lid]['users'][user][d][db])
                for d in locationToUserMap[lid]['users'][user]
                for db in locationToUserMap[lid]['users'][user][d])
    for user in userVectors.keys()[:]:
        if sum(userVectors[user].itervalues()) < place['minUserCheckins']:
            del userVectors[user]
    arffFile = ARFF.writeARFFForClustering(userVectors, place['name'])
    outputFileName = getARFFFileName(place)
    FileIO.createDirectoryForFile(outputFileName)
    GeneralMethods.runCommand('mv %s %s'%(arffFile, outputFileName))
def messageSelectionMethod(self, currentTimeStep, user, currentTopics, **conf):
    if self.lastObservedTimeStep!=currentTimeStep:
        self._updateTopicProbabilities(currentTimeStep, currentTopics, **conf)
    message = None
    if GeneralMethods.trueWith(user.messagingProbability):
        if GeneralMethods.trueWith(user.newTopicProbability):
            # Start a brand-new topic.
            topic = Topic(len(currentTopics))
            currentTopics.append(topic)
            message = user.generateMessage(currentTimeStep, topic)
        else:
            if GeneralMethods.trueWith(user.probabilityOfPickingPopularTopic):
                # Pick a popular topic, weighted by observed topic probabilities.
                if user.topicClass!=None:
                    topicIndex = GeneralMethods.weightedChoice([i[1] for i in self.topicProbabilities[user.topicClass]])
                    topic = self.topicProbabilities[user.topicClass][topicIndex][0]
                    message = user.generateMessage(currentTimeStep, topic)
                    if not GeneralMethods.trueWith(topic.stickiness):
                        message = None
                else:
                    topicIndex = GeneralMethods.weightedChoice([i[1] for i in self.topTopics])
                    topic = self.topTopics[topicIndex][0]
                    message = user.generateMessage(currentTimeStep, topic)
            else:
                # Pick a topic weighted by its stickiness instead of popularity.
                if user.topicClass!=None:
                    stickinesses = [topic[0].stickiness for topic in self.topicProbabilities[user.topicClass]]
                    total_stickiness = sum(stickinesses)
                    stickinesses = [s/total_stickiness for s in stickinesses]
                    topicIndex = GeneralMethods.weightedChoice(stickinesses)
                    topic = self.topicProbabilities[user.topicClass][topicIndex][0]
                    message = user.generateMessage(currentTimeStep, topic)
                else:
                    stickinesses = [topic[0].stickiness for tc in self.topicProbabilities for topic in self.topicProbabilities[tc]]
                    total_stickiness = sum(stickinesses)
                    stickinesses = [s/total_stickiness for s in stickinesses]
                    topicIndex = GeneralMethods.weightedChoice(stickinesses)
                    topic = self.allTopics[topicIndex][0]
                    message = user.generateMessage(currentTimeStep, topic)
    return message
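# GeneralMethods.weightedChoice returns an index drawn with probability
# proportional to the weights passed in; that is how every call site above
# uses its result. A minimal sketch under that assumption:
import random

def weightedChoice(weights):
    target = random.uniform(0, sum(weights))
    cumulative = 0.0
    for index, weight in enumerate(weights):
        cumulative += weight
        if target <= cumulative:
            return index
    return len(weights)-1  # guard against floating-point round-off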
def writeLocationToUserMap(place):
    name, boundary = place['name'], place['boundary']
    GeneralMethods.runCommand('rm -rf %s'%placesLocationToUserMapFile%name)
    for location in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin,
                                                             minUniqueUsersCheckedInTheLocation,
                                                             inputFile=locationToUserAndExactTimeMapFile):
        lid = getLocationFromLid(location['location'])
        if isWithinBoundingBox(lid, boundary):
            location['categories'] = ''
            location['tags'] = ''
            location['name'] = ''
            title = venuesCollection.find_one({'lid': location['location']})
            if title:
                location['name'] = unicode(title['n']).encode("utf-8")
            meta = venuesMetaDataCollection.find_one({'_id': location['location']})
            if meta:
                location['categories'] = unicode(meta['c']).encode("utf-8")
                location['tags'] = unicode(meta['t']).encode("utf-8")
            # Normalize user ids to strings.
            for user in location['users'].keys()[:]:
                location['users'][str(user)] = location['users'][user]
                del location['users'][user]
            location['noOfCheckins'] = sum([len(epochs)
                                            for user, userVector in location['users'].iteritems()
                                            for day, dayVector in userVector.iteritems()
                                            for db, epochs in dayVector.iteritems()])
            if location['noOfCheckins'] > place.get('minLocationCheckins', 0):
                FileIO.writeToFileAsJson(location, placesLocationToUserMapFile%name)
def writeTopClusterFeatures(place):
    locationNames = {}
    def getLocationName(lid):
        # Cache venue names to avoid repeated database lookups.
        if lid not in locationNames:
            locationObject = venuesCollection.find_one({'lid': lid})
            if locationObject:
                locationNames[lid] = unicode(locationObject['n']).encode("utf-8")
            else:
                locationNames[lid] = ''
        return locationNames[lid]
    GeneralMethods.runCommand('rm -rf %s'%placesUserClusterFeaturesFile%place['name'])
    documents = [userVector.values() for user, userVector in FileIO.iterateJsonFromFile(placesUserClustersFile%place['name'])]
    for data in getTopFeaturesForClass(documents, 1000):
        clusterId, features = data
        modifiedFeatures = []
        for feature in features:
            modifiedFeatures.append(list(feature) + [getLocationName(feature[0].replace('_', ' '))])
        FileIO.writeToFileAsJson([clusterId, GeneralMethods.getRandomColor(), modifiedFeatures],
                                 placesUserClusterFeaturesFile%place['name'])
def __init__(self, cluster, clusterFormationTime):
    self.crowdId = cluster.clusterId
    self.clusters = {GeneralMethods.getEpochFromDateTimeObject(clusterFormationTime): cluster}
    self.ends, self.inComingCrowds, self.outGoingCrowd = False, [], None
def mapper(self, key, value):
    if False: yield # Makes this method a generator even when nothing is emitted.
    hashtag_object = cjson.decode(value)
    if 'num_of_occurrences' in hashtag_object and\
            hashtag_object['num_of_occurrences'] >= MIN_HASHTAG_OCCURRENCES_FOR_PROPAGATION_ANALYSIS:
        ltuo_bucket_occ_time_and_occ_utm_id =\
            map(lambda (t, utm_id): (GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS), utm_id),
                hashtag_object['ltuo_occ_time_and_occ_utm_id'])
        ltuo_bucket_occ_time_and_occ_utm_id.sort(key=itemgetter(1))
        ltuo_utm_id_and_bucket_occ_times =\
            [(occ_utm_id, map(itemgetter(0), it_bucket_occ_time_and_occ_utm_id))
             for occ_utm_id, it_bucket_occ_time_and_occ_utm_id in
                groupby(ltuo_bucket_occ_time_and_occ_utm_id, key=itemgetter(1))]
        # Keep only UTM squares with more than 10 occurrences.
        ltuo_utm_id_and_bucket_occ_times = filter(lambda (_, occ_times): len(occ_times)>10,
                                                  ltuo_utm_id_and_bucket_occ_times)
        for _, bucket_occ_times in ltuo_utm_id_and_bucket_occ_times:
            gap_perct = 0.05
            gaps = np.arange(gap_perct, 1+gap_perct, gap_perct)
            bucket_occ_times = filter_outliers(bucket_occ_times)
            bucket_occ_times_at_gaps = get_items_at_gap(bucket_occ_times, gap_perct)
            start_time = float(bucket_occ_times_at_gaps[0])
            life_time = bucket_occ_times_at_gaps[-1] - start_time
            if life_time>0:
                # Normalize each sampled time to a percentage of the hashtag's lifetime.
                norm_num_of_occurrences =\
                    map(lambda t: int(((t-start_time)/life_time)*100), bucket_occ_times_at_gaps)
                for gap, norm_num_of_occurrence in zip(gaps, norm_num_of_occurrences):
                    self.mf_gap_to_norm_num_of_occurrences['%0.2f'%gap] += norm_num_of_occurrence
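# filter_outliers and get_items_at_gap are helpers defined elsewhere. From the
# way get_items_at_gap is used above (sampling a sorted list of bucketed times
# at every 5% position of its length), a hedged sketch:
import numpy as np

def get_items_at_gap(sorted_items, gap_perct):
    num_of_items = len(sorted_items)
    indices = [int(round(fraction*num_of_items))-1
               for fraction in np.arange(gap_perct, 1+gap_perct, gap_perct)]
    return [sorted_items[max(0, min(index, num_of_items-1))] for index in indices]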
def ef_plot():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id()+'.png'
    data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
    ltuo_hashtag_and_entropy_and_focus = map(itemgetter('hashtag', 'entropy', 'focus'), data)
    mf_norm_focus_to_entropies = defaultdict(list)
    for _, entropy, (_, focus) in ltuo_hashtag_and_entropy_and_focus:
        mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
    plt.figure(num=None, figsize=(6,3))
    x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                               for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                               if len(entropies)>0])
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
    plt.xlim(xmin=-0.1, xmax=1.1)
    plt.ylim(ymin=-1, ymax=9)
    plt.xlabel('Mean hashtag focus')
    plt.ylabel('Mean hashtag entropy')
    plt.grid(True)
    savefig(output_file)
    # Sort by entropy (descending), then by focus, and print the two extremes.
    ltuo_hashtag_and_r_entropy_and_focus =\
        sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
    ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
    hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
    print list(hashtags[:20])
    print list(reversed(hashtags))[:20]
def getOccuranesInHighestActiveRegion( hashtagObject, checkIfItFirstActiveRegion=False, timeUnit=TIME_UNIT_IN_SECONDS, maxLengthOfHighestActiveRegion=None, ): occurancesInActiveRegion, timeUnits = [], [] occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject["oc"], fillInGaps=True) if occurranceDistributionInEpochs: timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(), key=itemgetter(0))) hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2)) if not maxLengthOfHighestActiveRegion: validTimeUnits = [ timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1] + 1) ] else: validTimeUnits = [ timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1] + 1) ][:maxLengthOfHighestActiveRegion] occurancesInActiveRegion = [ (p, t) for p, t in hashtagObject["oc"] if GeneralMethods.approximateEpoch(t, timeUnit) in validTimeUnits ] if not checkIfItFirstActiveRegion: return occurancesInActiveRegion else: isFirstActiveRegion = False if timeUnits and timeUnits[0] == validTimeUnits[0]: isFirstActiveRegion = True return (occurancesInActiveRegion, isFirstActiveRegion)
def temporal_affinity_vs_distance():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    DataAnalysis._plot_affinities('adoption_lag')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Hashtag adoption lag (hours)')
    savefig(output_file)
def writeLocationsWithClusterInfoFile(place):
    GeneralMethods.runCommand('rm -rf %s'%placesLocationWithClusterInfoFile%place['name'])
    for clustering in iteraterUserClusterings(place):
        dataToWrite, userClusterMap = {}, {}
        for clusterId, users in clustering[2]['clusters'].iteritems():
            for user in users:
                userClusterMap[user] = clusterId
        locationMap = defaultdict(dict)
        for location in locationToUserMapIterator(place):
            locationMap[location['location']] = {'name': unicode(location['name']).encode("utf-8"),
                                                 'checkins': defaultdict(list)}
            for user, userVector in location['users'].iteritems():
                if user in userClusterMap:
                    for day, dayVector in userVector.iteritems():
                        for db, epochs in dayVector.iteritems():
                            locationMap[location['location']]['checkins'][userClusterMap[user]] += epochs
        dataToWrite[str(clustering[0])] = locationMap
        FileIO.writeToFileAsJson(dataToWrite, placesLocationWithClusterInfoFile%place['name'])
def getOccuranesInHighestActiveRegion(hashtagObject):
    def getActiveRegions(timeSeries):
        noOfZerosObserved, activeRegions = 0, []
        currentRegion, occurancesForRegion = None, 0
        for index, l in zip(range(len(timeSeries)), timeSeries):
            if l > 0:
                if noOfZerosObserved > MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION or index==0:
                    # A new active region starts here.
                    currentRegion = [None, None, None]
                    currentRegion[0] = index
                    occurancesForRegion = 0
                noOfZerosObserved = 0
                occurancesForRegion += l
            else:
                noOfZerosObserved += 1
                if noOfZerosObserved > MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION and currentRegion and currentRegion[1]==None:
                    # Close the current region at the last non-zero time unit.
                    currentRegion[1] = index-MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION-1
                    currentRegion[2] = occurancesForRegion
                    activeRegions.append(currentRegion)
        if not activeRegions:
            activeRegions.append([0, len(timeSeries)-1, sum(timeSeries)])
        else:
            # The series ended while a region was still open; close it.
            currentRegion[1], currentRegion[2] = index, occurancesForRegion
            activeRegions.append(currentRegion)
        return activeRegions
    occurranceDistributionInEpochs = getOccurranceDistributionInEpochs(hashtagObject['oc'])
    startEpoch = min(occurranceDistributionInEpochs, key=itemgetter(0))[0]
    endEpoch = max(occurranceDistributionInEpochs, key=itemgetter(0))[0]
    dataX = range(startEpoch, endEpoch, TIME_UNIT_IN_SECONDS)
    occurranceDistributionInEpochs = dict(occurranceDistributionInEpochs)
    # Fill in empty epochs so the time series is contiguous.
    for x in dataX:
        if x not in occurranceDistributionInEpochs:
            occurranceDistributionInEpochs[x] = 0
    timeUnits, timeSeries = zip(*sorted(occurranceDistributionInEpochs.iteritems(), key=itemgetter(0)))
    hashtagPropagatingRegion = max(getActiveRegions(timeSeries), key=itemgetter(2))
    validTimeUnits = [timeUnits[i] for i in range(hashtagPropagatingRegion[0], hashtagPropagatingRegion[1]+1)]
    return [(p, t) for p, t in hashtagObject['oc']
            if GeneralMethods.approximateEpoch(t, TIME_UNIT_IN_SECONDS) in validTimeUnits]
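# A worked example of getActiveRegions, assuming
# MIN_NO_OF_TIME_UNITS_IN_INACTIVE_REGION = 1 (more than one consecutive empty
# time unit ends a region). Each region is [start_index, end_index, occurrences]:
#
#     timeSeries = (3, 2, 0, 0, 5, 0, 1)
#     getActiveRegions(timeSeries)  # -> [[0, 1, 5], [4, 6, 6]]
#
# The single-zero gap at index 5 does not split the second region, while the
# two-zero gap at indices 2-3 does. The highest active region, picked with
# max(..., key=itemgetter(2)), is therefore [4, 6, 6].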
def iid_vs_cumulative_distribution_and_peak_distribution():
    TIME_UNIT_IN_SECONDS = 10.*60.
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    ltuo_iid_and_interval_stats = [data for data in FileIO.iterateJsonFromFile(f_iid_spatial_metrics, remove_params_dict=True)]
    ltuo_s_iid_and_interval_stats = sorted(ltuo_iid_and_interval_stats, key=itemgetter(0))
    ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences =\
        [(data[0], (data[1][0], data[1][2])) for data in ltuo_s_iid_and_interval_stats]
    total_peaks = sum([data[1][0] for data in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences])+0.0
    x_iids = []
    y_is_peaks = []
    z_cumulative_percentage_of_occurrencess = []
    for iid, (is_peak, cumulative_percentage_of_occurrences) in\
            ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences[:100]:
        print (iid, (is_peak, cumulative_percentage_of_occurrences))
        x_iids.append((iid+1)*TIME_UNIT_IN_SECONDS/60)
        y_is_peaks.append(is_peak/total_peaks)
        z_cumulative_percentage_of_occurrencess.append(cumulative_percentage_of_occurrences)
    plt.figure(num=None, figsize=(4.3,3))
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.plot(x_iids, y_is_peaks, marker='o', c='k')
    plt.ylabel('Distribution of hashtags')
    plt.xlabel('Hashtag peak (minutes)')
    plt.grid(True)
    plt.xlim(xmax=600)
    savefig(output_file_format%'peaks')
    plt.clf()
    plt.figure(num=None, figsize=(6,3))
    plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
    plt.plot(x_iids, z_cumulative_percentage_of_occurrencess, lw=0, marker='o', c='k')
    plt.ylabel('CDF of occurrences')
    plt.xlabel('Time (Minutes)')
    plt.grid(True)
    plt.xlim(xmax=600)
    savefig(output_file_format%'cdf_occurrences_peak')
def significant_nei_utm_ids():
    output_folder = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'/%s.png'
    for i, data in enumerate(FileIO.iterateJsonFromFile(f_significant_nei_utm_ids, remove_params_dict=True)):
        utm_lat_long = UTMConverter.getLatLongUTMIdInLatLongForm(data['utm_id'])
        nei_utm_lat_longs = map(lambda nei_utm_id: UTMConverter.getLatLongUTMIdInLatLongForm(nei_utm_id),
                                data['nei_utm_ids'])
        if nei_utm_lat_longs:
            output_file = output_folder%('%s_%s'%(utm_lat_long))
            plotPointsOnWorldMap(nei_utm_lat_longs, blueMarble=False, bkcolor='#CFCFCF', lw=0,
                                 color='#EA00FF', alpha=1.)
            _, m = plotPointsOnWorldMap([utm_lat_long], blueMarble=False, bkcolor='#CFCFCF', lw=0,
                                        color='#2BFF00', s=40, returnBaseMapObject=True, alpha=1.)
            for nei_utm_lat_long in nei_utm_lat_longs:
                m.drawgreatcircle(utm_lat_long[1], utm_lat_long[0],
                                  nei_utm_lat_long[1], nei_utm_lat_long[0],
                                  color='#FFA600', lw=1.5, alpha=1.0)
            print 'Saving %s'%(i+1)
            savefig(output_file)
def getStreamStats(streamTweetsIterator):
    '''
    30-day Experts stats:
        # of users: 4804
        # of tweets: 1614510
        # of tweets per tu (mean, var): 186.497631974 7860.12570191
    Houston stats:
        # of users: 107494
        # of tweets: 15946768
        # of tweets per tu (mean, var): 1730.33506944 4834419.37341
    10-day Experts stats:
        # of users: 4674
        # of tweets: 608798
        # of tweets per tu (mean, var): 190.726190476 8132.75460228
    Houston stats:
        # of users: 39618
        # of tweets: 2139829
        # of tweets per tu (mean, var): 619.163483796 94450.7334004
    '''
    numberOfTweets, users, distributionPerTU = 0, set(), defaultdict(int)
    for tweet in streamTweetsIterator:
        users.add(tweet['user']['screen_name'])
        # Bucket tweets into 5-minute (300-second) time units.
        distributionPerTU[GeneralMethods.getEpochFromDateTimeObject(getDateTimeObjectFromTweetTimestamp(tweet['created_at']))//300] += 1
        numberOfTweets += 1
    print '# of users: ', len(users)
    print '# of tweets: ', numberOfTweets
    print '# of tweets per tu (mean, var): ', np.mean(distributionPerTU.values()), np.var(distributionPerTU.values())
def content_affinity_vs_distance():
    output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
    DataAnalysis._plot_affinities('similarity')
    plt.xlabel('Distance (miles)')
    plt.ylabel('Hashtags sharing similarity')
    savefig(output_file)
def addLocationPointsWithTitles(self, points, color=None):
    if not color:
        color = GeneralMethods.getRandomColor()
    # Input points are (lat, long); KML expects (long, lat).
    for point, title in ((list(reversed(point)), title) for point, title in points):
        pnt = self.kml.newpoint(description=title, coords=[point])
        pnt.iconstyle.icon.href = "http://maps.google.com/mapfiles/kml/shapes/shaded_dot.png"
        # KML colors are aabbggrr; prepend full opacity to the hex color.
        pnt.iconstyle.color = "ff" + color[1:]
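# Minimal usage sketch, assuming this method lives on a wrapper class that
# holds a simplekml.Kml instance as self.kml (consistent with the calls above).
# The place name and coordinates below are made up for illustration.
import simplekml

kml = simplekml.Kml()
pnt = kml.newpoint(description='Downtown Houston', coords=[(-95.3698, 29.7604)])  # (long, lat)
pnt.iconstyle.icon.href = "http://maps.google.com/mapfiles/kml/shapes/shaded_dot.png"
pnt.iconstyle.color = "ff" + "2BFF00"  # aabbggrr: full alpha + hex channels
kml.save('points.kml')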
def test_append(self): self.crowd.append(self.cluster, test_time + timedelta(days=1)) self.assertEqual([ GeneralMethods.getEpochFromDateTimeObject(test_time), GeneralMethods.getEpochFromDateTimeObject(test_time + timedelta(days=1)) ], sorted(self.crowd.clusters.keys())) self.assertEqual( StreamCluster, type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject( test_time)])) self.assertEqual(2, self.crowd.lifespan) self.assertEqual( getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(self.crowd.startTime)) self.assertEqual( getStringRepresentationForTweetTimestamp(test_time + timedelta(days=1)), getStringRepresentationForTweetTimestamp(self.crowd.endTime))
def sampleCrowds(self):
    # Set dates for experts as startingDay=datetime(2011,3,19), endingDay=datetime(2011,3,30)
    # with a minimum of 7 users at a time.
    AnalyzeData.reset()
    AnalyzeData.constructCrowdDataStructures(self.stream_settings['data_iterator'])
    fig = plt.figure()
    ax = fig.gca()
    # Other crowd samples explored: sports tags (#redsox, #mlb, #sfgiants, ...),
    # '#ctia' (CTIA 2011), US health-care tags (#hcr, #teaparty, ...), UK budget
    # tags (#budget11, #pmqs, ...), and Middle East tags (#egypt, #jan25,
    # #gaddafi, ...) with title 'Middle East'.
    expectedTags = set(['#libya'])
    title = 'Libya'
    for crowd in self._filteredCrowdIterator():
        if expectedTags.intersection(set(list(crowd.hashtagDimensions))):
            x, y = zip(*[(datetime.fromtimestamp(clusterGenerationTime),
                          len(crowd.clusters[clusterGenerationTime].documentsInCluster))
                         for clusterGenerationTime in sorted(crowd.clusters)])
            plt.plot_date(x, y, '-', color=GeneralMethods.getRandomColor(), lw=2,
                          label=' '.join([crowd.crowdId]+list(crowd.hashtagDimensions)[:1]))
    fig.autofmt_xdate(rotation=30)
    ax.xaxis.set_major_locator(matplotlib.dates.HourLocator(interval=24))
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%a %d %b'))
    plt.xlim((datetime(2011, 3, 19), datetime(2011, 3, 30)))
    plt.title(getLatexForString('Crowds for '+title))
    plt.ylabel(getLatexForString('Crowd size'))
    plt.show()
def iteratePhrases():
    for tweet in TweetFiles.iterateTweetsFromGzip('/mnt/chevron/kykamath/data/twitter/tweets_by_trends/2011_2_6.gz'):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(tweet, **settings)
        if message.vector:
            for phrase in message.vector:
                if phrase!='':
                    # Emit (phrase, epoch rounded down to the minute).
                    yield (phrase, GeneralMethods.approximateEpoch(GeneralMethods.getEpochFromDateTimeObject(message.timeStamp), 60))
def append(self, cluster, clusterFormationTime): self.clusters[GeneralMethods.getEpochFromDateTimeObject( clusterFormationTime)] = cluster
def copy_file(input_file, output_file): command = 'cp %s %s' % (input_file, output_file) GeneralMethods.runCommand(command)