def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
    def location_similarity(location_vector_1, location_vector_2):
        # Sparse dot product over the union of keys; missing keys contribute 0.
        return reduce(lambda total, k: total + (location_vector_1.get(k, 0) * location_vector_2.get(k, 0)),
                      set(location_vector_1.keys()).union(location_vector_2.keys()), 0.)
    influence_types = [InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE,
                       InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE,
                       InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
    for model_id in model_ids:
        mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
        GeneralMethods.runCommand('rm -rf %s' % tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file % model_id)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file % (outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')))):
            print line_count
            location = location_object['id']
            tuo_neighbor_location_and_mf_influence_type_and_similarity = []
            for neighbor_location in location_object['links'].keys():
                mf_influence_type_and_similarity = {}
                for influence_type in influence_types:
                    similarity = location_similarity(
                        mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                        mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type])
                    mf_influence_type_and_similarity[influence_type] = similarity
                # Jaccard similarity between the two locations' hashtag sets.
                so_hashtags_for_location = set(location_object['hashtags'].keys())
                so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator / denominator
                tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
            FileIO.writeToFileAsJson(
                [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file % model_id)
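# A minimal, self-contained sketch (not part of the pipeline) of the two similarity
# measures used above, on made-up toy vectors.
def _demo_similarity_measures():
    v1, v2 = {'a': 1.0, 'b': 2.0}, {'b': 2.0, 'c': 3.0}
    # Sparse dot product over the union of keys, as in location_similarity.
    dot = sum(v1.get(k, 0) * v2.get(k, 0) for k in set(v1).union(v2))
    # Jaccard similarity between key sets, as used for JACCARD_SIMILARITY.
    s1, s2 = set(v1), set(v2)
    jaccard = len(s1.intersection(s2)) / (len(s1.union(s2)) + 0.)
    print dot      # 4.0 (only 'b' overlaps: 2.0 * 2.0)
    print jaccard  # 0.333... (1 shared key out of 3)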
def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
    for model_id in models_ids:
#        if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#        else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
        output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file % (model_id, hashtag_tag)
        GeneralMethods.runCommand('rm -rf %s' % output_file)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file % (outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')))):
            print line_count, model_id
            tuo_neighbor_location_and_pure_influence_score = []
            location_hashtag_set = set(location_object['hashtags'])
            for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                pure_influence_scores = []
                for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                    if hashtag in location_object['hashtags']:
                        location_occurrences = location_object['hashtags'][hashtag][0]
                        pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                if hashtag_tag == w_extra_hashtags_tag:
                    # Hashtags seen only at the source location count as maximal outgoing
                    # influence (+1); hashtags seen only at the neighbor count as maximal
                    # incoming influence (-1).
                    for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set):
                        pure_influence_scores.append(1.0)
                    for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set):
                        pure_influence_scores.append(-1.0)
                mean_pure_influence_score = np.mean(pure_influence_scores)
                tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
            tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
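# A toy check (made-up hashtag sets and an assumed model score of 0.5 for the one
# shared hashtag) of the +1/-1 padding rule applied above when hashtag_tag is
# w_extra_hashtags_tag.
def _demo_extra_hashtag_padding():
    location_hashtag_set = set(['ht1', 'ht2', 'ht3'])
    neighbor_location_hashtag_set = set(['ht2', 'ht4'])
    pure_influence_scores = [0.5]  # hypothetical score for the shared hashtag 'ht2'
    for _ in location_hashtag_set.difference(neighbor_location_hashtag_set):
        pure_influence_scores.append(1.0)
    for _ in neighbor_location_hashtag_set.difference(location_hashtag_set):
        pure_influence_scores.append(-1.0)
    print sum(pure_influence_scores) / len(pure_influence_scores)  # (0.5 + 1 + 1 - 1) / 4 = 0.375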
def trendCurves():
    model = MixedUsersModel()
    experimentFileName = spamModelFolder + model.id
    conf = {'model': model,
            'addUsersMethod': User.addUsersUsingRatio,
            'analysisMethods': [(Analysis.trendCurves, 1)],
            'ratio': {'normal': 0.985, 'spammer': 0.015},
            'experimentFileName': experimentFileName}
    GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
    run(**conf)
    Analysis.trendCurves(experimentFileName=experimentFileName)
def analyzeQuality(graphs, graphType):
    def getQualityScore(graphMap, edgesToKeep, timeDifference):
        dataToReturn = []
        for j, intervalInSeconds in enumerate([1]):
            intervalInSeconds *= timeDifference
            linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId + 1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
            logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId + 1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
            linearClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            logarithmicClusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
            score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
            print intervalInSeconds, edgesToKeep, score
            dataToReturn.append(score)
        return dataToReturn
    graphFile = qualityMetricsFolder % graphType
    print graphFile
    GeneralMethods.runCommand('rm -rf %s' % graphFile)
    for edgesToKeep in range(1, 11):
#    for edgesToKeep in [1, 10]:
        edgesToKeep *= 0.1
        graphMap = dict(graphs[:])
        startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
        timeDifference = endingGraphId - startingGraphId
        LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#        print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
        FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
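# The sorted-then-groupby idiom used above to turn (node, clusterId) pairs from
# clusterUsingAffinityPropagation into [clusterId, [nodes]] lists, shown on toy data.
def _demo_cluster_grouping():
    from itertools import groupby
    from operator import itemgetter
    node_and_cluster_id = [('n1', 0), ('n2', 1), ('n3', 0), ('n4', 1)]
    clusters = [[str(c), [l[0] for l in lst]]
                for c, lst in groupby(sorted(node_and_cluster_id, key=itemgetter(1)), key=itemgetter(1))]
    print clusters  # [['0', ['n1', 'n3']], ['1', ['n2', 'n4']]]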
def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
    for test_model_id in test_models_ids:
        output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score % (test_model_id)
        GeneralMethods.runCommand('rm -rf %s' % output_file)
        ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
        for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
            # Group occurrence times by location; each location gets a sorted time list.
            ltuo_location_and_occurrence_times = [
                (location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                for location, ito_location_and_occurrence_time in groupby(
                        sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)), key=itemgetter(0))]
            print hashtag_count, test_model_id
            ltuo_location_and_pure_influence_score = []
            for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                pure_influence_scores = []
                for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                    if location != neighbor_location:
                        pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                        pure_influence_scores.append(pure_influence_score)
                ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
            ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
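# A toy illustration (made-up tuples) of the grouping step above: collapsing
# (location, occurrence_time) tuples into one sorted time list per location.
def _demo_group_occurrences_by_location():
    from itertools import groupby
    from operator import itemgetter
    ltuo_location_and_occurrence_time = [('nyc', 5), ('austin', 2), ('nyc', 1)]
    grouped = [(location, sorted(zip(*tuples)[1]))
               for location, tuples in groupby(
                       sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)), key=itemgetter(0))]
    print grouped  # [('austin', [2]), ('nyc', [1, 5])]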
def writeUserClustersFile(place):
    print 'Generating clusters...'
    userVectors = GenerateDataFiles.getUserVectors(place)
    GeneralMethods.runCommand('rm -rf %s' % placesUserClustersFile % place['name'])
    clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile % place['name'], userVectors, '-N -1')
#    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
    for userId, userVector in userVectors.iteritems():
        userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
    for data in userVectors.iteritems():
        FileIO.writeToFileAsJson(data, placesUserClustersFile % place['name'])
def performanceWithSpamFilteringForLatestMessages(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
#        for spammerPercentage in range(1,21): spammerPercentage = spammerPercentage*0.05
#        for spammerPercentage in range(1,11): spammerPercentage = spammerPercentage*0.02
#        for spammerPercentage in range(1,201): spammerPercentage = spammerPercentage*0.005
        l1 = [spammerPercentage * 0.001 for spammerPercentage in range(1, 51)]
        l2 = [spammerPercentage * 0.05 for spammerPercentage in range(1, 21)]
        l3 = [0.01] + l2
        for spammerPercentage in l1:
            experimentFileName = spamModelFolder + 'performanceWithSpamFilteringForLatestMessages/%s/%0.3f' % (iteration, spammerPercentage)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10,
                        'addUsersMethod': User.addUsersUsingRatio,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 1 - spammerPercentage, 'spammer': spammerPercentage},
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered],
                        'experimentFileName': experimentFileName,
#                        'noOfPayloadsPerSpammer': 1,
                        'noOfTopics': 10}
#                conf = {'model': model, 'numberOfTimeSteps': 10, 'addUsersMethod': User.addUsersUsingRatio,
#                        'analysisMethods': [(Analysis.measureRankingQuality, 1)], 'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
#                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
#                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id] += data['spammmess'][ranking_id]
                experimentData[iteration][spammerPercentage] = tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems():
                    dataY[ranking_id].append(np.mean(values))
            dataX = sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]):
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x] = []
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY:
            plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Percentage of Spammers', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#        plt.title('Performance with spam filtering')
        plt.legend(loc=2)
#        plt.show()
        plt.xlim(xmax=0.05)
        plt.savefig('performanceWithSpamFilteringForLatestMessages.png')
        plt.clf()
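# The pooling pattern used in the plotting branch above, on fake numbers: per-iteration
# means are collected per x value across iterations, and the plotted value is the mean
# of those means. The 'latestMessages' key and the numbers are illustrative only.
def _demo_pooling_across_iterations():
    realDataY = defaultdict(dict)  # ranking_id -> x -> [per-iteration means]
    for iteration_means in [{0.01: 0.30, 0.02: 0.40}, {0.01: 0.50, 0.02: 0.60}]:
        for x, y in iteration_means.iteritems():
            if x not in realDataY['latestMessages']: realDataY['latestMessages'][x] = []
            realDataY['latestMessages'][x].append(y)
    print [np.mean(realDataY['latestMessages'][x]) for x in sorted(realDataY['latestMessages'])]
    # -> [0.4, 0.5]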
def performanceWithSpamDetection(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0, 0.4, 0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
#    spammerPercentages = [0.2, 0.01, 0.01]
    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        for spamDetectionRatio, spammerPercentage in zip(ratios, spammerPercentages):
            experimentFileName = spamModelFolder + 'performanceWithSpamDetection/%s/%0.3f' % (iteration, spamDetectionRatio)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 100,
                        'addUsersMethod': User.addUsersUsingRatioWithSpamDetection,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 1 - spammerPercentage, 'spammer': spammerPercentage},
#                        'spammerMessagingProbability': spammerBudget,
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered,
                                           RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                        'spamDetectionRatio': spamDetectionRatio,
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                run(**conf)
            else:
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        if data['currentTimeStep'] not in experimentData[spamDetectionRatio]:
                            experimentData[spamDetectionRatio][data['currentTimeStep']] = defaultdict(list)
                        experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id] += data['spammmess'][ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
            for timeUnit in experimentData[spamDetectionRatio]:
                dataToPlot['x'].append(timeUnit)
                for ranking_id in experimentData[spamDetectionRatio][timeUnit]:
                    dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][timeUnit][ranking_id]))
            sdr[spamDetectionRatio] = dataToPlot
        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
#        for ranking_id in [RankingModel.LATEST_MESSAGES, RankingModel.POPULAR_MESSAGES]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
                dataY = smooth(sdr[spamDetectionRatio][ranking_id], 8)[:len(sdr[spamDetectionRatio]['x'])]
                dataX, dataY = sdr[spamDetectionRatio]['x'][10:], dataY[10:]
                print 'x', [x - 10 for x in dataX]
                if spamDetectionRatio == 0.0:
                    print ranking_id, dataY
                    plt.plot([x - 10 for x in dataX], dataY, label='%s' % (labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
                else:
                    print ranking_id, dataY
                    plt.plot([x - 10 for x in dataX], dataY,
                             label='%s (%d' % (labels[ranking_id].replace('Filtering', 'Detection'), spamDetectionRatio * 100) + '%)',
                             lw=1, marker=marker[spamDetectionRatio])
            plt.ylim(ymin=0, ymax=1)
            plt.xlim(xmin=0, xmax=75)
#            plt.title(ranking_id)
            plt.legend()
            plt.xlabel('Time', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#            plt.show()
#            plt.savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            savefig('performanceWithSpamDetection_%s.png' % ranking_id)
            plt.clf()
def writeARFFFile(place):
    userVectors = defaultdict(dict)
    locationToUserMap = dict((l['location'], l) for l in locationToUserMapIterator(place, minCheckins=50))
    for lid in locationToUserMap:
        for user in locationToUserMap[lid]['users']:
            userVectors[user][lid.replace(' ', '_')] = sum(
                len(locationToUserMap[lid]['users'][user][d][db])
                for d in locationToUserMap[lid]['users'][user]
                for db in locationToUserMap[lid]['users'][user][d])
    for user in userVectors.keys()[:]:
        if sum(userVectors[user].itervalues()) < place['minUserCheckins']:
            del userVectors[user]
    arffFile = ARFF.writeARFFForClustering(userVectors, place['name'])
    outputFileName = getARFFFileName(place)
    FileIO.createDirectoryForFile(outputFileName)
    GeneralMethods.runCommand('mv %s %s' % (arffFile, outputFileName))
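# The keys()[:] idiom above: in Python 2, dict.keys() already returns a list, and the
# extra [:] is a defensive copy, so entries can be deleted while iterating (iterating
# the dict directly while deleting would raise a RuntimeError). A toy check with a
# made-up threshold:
def _demo_inplace_dict_filter():
    userVectors = {'u1': {'loc_a': 1}, 'u2': {'loc_a': 5, 'loc_b': 5}}
    minUserCheckins = 3
    for user in userVectors.keys()[:]:
        if sum(userVectors[user].itervalues()) < minUserCheckins:
            del userVectors[user]
    print userVectors.keys()  # ['u2']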
def writeLocationToUserMap(place):
    name, boundary = place['name'], place['boundary']
    GeneralMethods.runCommand('rm -rf %s' % placesLocationToUserMapFile % name)
    for location in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, inputFile=locationToUserAndExactTimeMapFile):
        lid = getLocationFromLid(location['location'])
        if isWithinBoundingBox(lid, boundary):
            location['categories'] = ''
            location['tags'] = ''
            location['name'] = ''
            title = venuesCollection.find_one({'lid': location['location']})
            if title: location['name'] = unicode(title['n']).encode("utf-8")
            meta = venuesMetaDataCollection.find_one({'_id': location['location']})
            if meta:
                location['categories'] = unicode(meta['c']).encode("utf-8")
                location['tags'] = unicode(meta['t']).encode("utf-8")
            for user in location['users'].keys()[:]:
                location['users'][str(user)] = location['users'][user]
                del location['users'][user]
            location['noOfCheckins'] = sum([len(epochs)
                                            for user, userVector in location['users'].iteritems()
                                            for day, dayVector in userVector.iteritems()
                                            for db, epochs in dayVector.iteritems()])
            if location['noOfCheckins'] > place.get('minLocationCheckins', 0):
                FileIO.writeToFileAsJson(location, placesLocationToUserMapFile % name)
def writeTopClusterFeatures(place):
    locationNames = {}
    def getLocationName(lid):
        # Cache venue names to avoid repeated MongoDB lookups.
        if lid not in locationNames:
            locationObject = venuesCollection.find_one({'lid': lid})
            if locationObject: locationNames[lid] = unicode(locationObject['n']).encode("utf-8")
            else: locationNames[lid] = ''
        return locationNames[lid]
    GeneralMethods.runCommand('rm -rf %s' % placesUserClusterFeaturesFile % place['name'])
    documents = [userVector.values() for user, userVector in FileIO.iterateJsonFromFile(placesUserClustersFile % place['name'])]
    for data in getTopFeaturesForClass(documents, 1000):
        clusterId, features = data
        modifiedFeatures = []
        for feature in features:
            modifiedFeatures.append(list(feature) + [getLocationName(feature[0].replace('_', ' '))])
        FileIO.writeToFileAsJson([clusterId, GeneralMethods.getRandomColor(), modifiedFeatures], placesUserClusterFeaturesFile % place['name'])
def writeLocationsWithClusterInfoFile(place):
    GeneralMethods.runCommand('rm -rf %s' % placesLocationWithClusterInfoFile % place['name'])
    for clustering in iteraterUserClusterings(place):
        dataToWrite, userClusterMap = {}, {}
        for clusterId, users in clustering[2]['clusters'].iteritems():
            for user in users: userClusterMap[user] = clusterId
        locationMap = defaultdict(dict)
        for location in locationToUserMapIterator(place):
            locationMap[location['location']] = {'name': unicode(location['name']).encode("utf-8"), 'checkins': defaultdict(list)}
            for user, userVector in location['users'].iteritems():
                if user in userClusterMap:
                    for day, dayVector in userVector.iteritems():
                        for db, epochs in dayVector.iteritems():
                            locationMap[location['location']]['checkins'][userClusterMap[user]] += epochs
        dataToWrite[str(clustering[0])] = locationMap
        FileIO.writeToFileAsJson(dataToWrite, placesLocationWithClusterInfoFile % place['name'])
def performanceAsPercentageOfGlobalSpammerVaries(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
#        for spammerPercentage in range(1,21): spammerPercentage = spammerPercentage*0.05
        for spammerPercentage in range(1, 11):
            spammerPercentage = spammerPercentage * 0.1
            experimentFileName = spamModelFolder + 'performanceAsPercentageOfGlobalSpammerVaries/%s/%0.3f' % (iteration, spammerPercentage)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10,
                        'addUsersMethod': User.addUsersUsingRatio,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 0.985, 'spammer': 0.015},
                        'spamRatio': {'localPayloads': 1 - spammerPercentage, 'globalPayloads': spammerPercentage},
                        'noOfGlobalSpammerPayloads': 10,
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id] += data['spammmess'][ranking_id]
                experimentData[iteration][spammerPercentage] = tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems():
                    dataY[ranking_id].append(np.mean(values))
            dataX = sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]):
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x] = []
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY:
            if ranking_id in labels:
                plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Percentage of Spammers Using Group Strategy', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#        plt.title('Spammness when spammers use mixed strategy')
        plt.legend(loc=4)
#        plt.show()
        plt.savefig('performanceAsPercentageOfGlobalSpammerVaries.png')
        plt.clf()
def performanceAsNoOfGlobalPayloadsVary(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
        for noOfGlobalSpammerPayloads in range(1, 500):
#        for noOfGlobalSpammerPayloads in range(10, 11):
            Spammer.globalPayloads = None
            experimentFileName = spamModelFolder + 'performanceAsNoOfGlobalPayloadsVary/%s/%0.3f' % (iteration, noOfGlobalSpammerPayloads)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10,
                        'addUsersMethod': User.addUsersUsingRatio,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 0.985, 'spammer': 0.015},
                        'noOfGlobalSpammerPayloads': noOfGlobalSpammerPayloads,
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id] += data['spammmess'][ranking_id]
                experimentData[iteration][noOfGlobalSpammerPayloads] = tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems():
                    dataY[ranking_id].append(np.mean(values))
            dataX = sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]):
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x] = []
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY:
            if ranking_id in labels:
                # Keep the first 20 points raw and smooth only the tail of the curve.
                dy = [np.mean(realDataY[ranking_id][x]) for x in dataX[:20]] + list(smooth([np.mean(realDataY[ranking_id][x]) for x in dataX[20:]]))
                plt.semilogx(dataX, dy[:len(dataX)], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
#        for ranking_id in dataY: plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('Payloads Per Spam Group', fontsize=15, fontweight='bold')
        plt.ylabel('Spamness', fontsize=15, fontweight='bold')
#        plt.title('Spammness with changing global payloads')
        plt.legend(loc=4)
#        plt.show()
        plt.savefig('performanceAsNoOfGlobalPayloadsVary.png')
        plt.clf()
def generate_tuo_location_and_tuo_neighbor_location_and_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
    def get_hashtag_weights(map_from_hashtag_to_tuples_of_occurrences_and_time_range):
        # Weight each hashtag by its share of the location's total occurrences.
        total_occurrences = sum([len(occurrences)
                                 for hashtag, (occurrences, time_range) in map_from_hashtag_to_tuples_of_occurrences_and_time_range.iteritems()]) + 0.
        return dict([(hashtag, len(occurrences) / total_occurrences)
                     for hashtag, (occurrences, time_range) in map_from_hashtag_to_tuples_of_occurrences_and_time_range.iteritems()])
    def get_location_weights(hashtags_for_source_location, map_from_location_to_hashtags):
        # Weight each neighbor by the fraction of the source location's hashtags it shares.
        set_of_hashtags_for_source_location = set(hashtags_for_source_location.keys())
        return dict([(location, len(set(hashtags.keys()).intersection(set_of_hashtags_for_source_location)) / (len(set_of_hashtags_for_source_location) + 0.))
                     for location, hashtags in map_from_location_to_hashtags.iteritems()])
    for model_id in models_ids:
        output_file = tuo_location_and_tuo_neighbor_location_and_influence_score_file % (model_id, hashtag_tag)
        GeneralMethods.runCommand('rm -rf %s' % output_file)
        for line_count, location_object in enumerate(iterateJsonFromFile(
                location_objects_file % (outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')))):
            print line_count, model_id
            tuo_neighbor_location_and_influence_score = []
            mf_hashtag_to_hashtag_weights = get_hashtag_weights(location_object['hashtags'])
            mf_location_to_location_weights = get_location_weights(location_object['hashtags'], location_object['links'])
            location_hashtag_set = set(location_object['hashtags'])
            for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                influence_scores = []
                mf_neighbor_location_hashtag_to_hashtag_weights = get_hashtag_weights(mf_hashtag_to_tuo_occurrences_and_time_range)
                neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                    if hashtag in location_object['hashtags']:
                        location_occurrences = location_object['hashtags'][hashtag][0]
                        pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences)
                        influence_scores.append(mf_hashtag_to_hashtag_weights[hashtag] * pure_influence_score)
                if hashtag_tag == w_extra_hashtags_tag:
                    for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set):
                        influence_scores.append(mf_hashtag_to_hashtag_weights[hashtag] * 1.0)
#                        influence_scores.append(1.0)
                    for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set):
                        influence_scores.append(mf_neighbor_location_hashtag_to_hashtag_weights[hashtag] * -1.0)
#                        influence_scores.append(-1.0)
                mean_influence_scores = np.mean(influence_scores)
                tuo_neighbor_location_and_influence_score.append([neighbor_location, mf_location_to_location_weights[neighbor_location] * mean_influence_scores])
            tuo_neighbor_location_and_influence_score = sorted(tuo_neighbor_location_and_influence_score, key=itemgetter(1))
            FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_influence_score], output_file)
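# A toy check (made-up occurrence lists) of the weighting helpers above: hashtag
# weights sum to 1 over a location's occurrences, and a neighbor's location weight is
# the fraction of the source's hashtags it shares.
def _demo_weights():
    mf_hashtag_to_tuo_occurrences_and_time_range = {'ht1': ([1, 2, 3], None), 'ht2': ([4], None)}
    total = sum(len(occurrences) for occurrences, _ in mf_hashtag_to_tuo_occurrences_and_time_range.itervalues()) + 0.
    hashtag_weights = dict((hashtag, len(occurrences) / total)
                           for hashtag, (occurrences, _) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems())
    print hashtag_weights  # {'ht1': 0.75, 'ht2': 0.25}
    source_hashtags = set(['ht1', 'ht2'])
    neighbor_hashtags = set(['ht1', 'ht3'])
    print len(neighbor_hashtags.intersection(source_hashtags)) / (len(source_hashtags) + 0.)  # 0.5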
def compare_zones_with_test_set(ltuo_model_id_and_hashtag_tag, test_model_id):
    output_file = fld_results % GeneralMethods.get_method_id() + 'results.csv'
    GeneralMethods.runCommand('rm -rf %s' % output_file)
    mf_model_id_to_misrank_accuracies = defaultdict(list)
    mf_model_id_to_mf_location_to_zone_id = {}
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        no_of_zones, ltuo_location_and_influence_score_and_zone_id = Experiments.get_location_with_zone_ids(model_id, hashtag_tag)
        locations, influence_scores, zone_ids = zip(*ltuo_location_and_influence_score_and_zone_id)
        mf_model_id_to_mf_location_to_zone_id[model_id] = dict(zip(locations, zone_ids))
    ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
    for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
            enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
#        print hashtag_count
#        if hashtag_count==10: break
        ltuo_location_and_occurrence_time = sorted(ltuo_location_and_occurrence_time, key=itemgetter(1))
        # Order locations by the first time the hashtag occurred in each of them.
        locations = reduce(InfluenceAnalysis._to_locations_based_on_first_occurence, zip(*ltuo_location_and_occurrence_time)[0], [])
#        for hashtag_count, (hashtag, ltuo_location_and_pure_influence_score) in\
#                enumerate(Experiments.load_ltuo_test_hashtag_and_ltuo_location_and_pure_influence_score(test_model_id)):
#            locations = zip(*ltuo_location_and_pure_influence_score)[0]
        for model_id, mf_location_to_zone_id in\
                mf_model_id_to_mf_location_to_zone_id.iteritems():
            models_location_rank = [mf_location_to_zone_id[location] for location in locations if location in mf_location_to_zone_id]
            if len(models_location_rank) > 1:
                misrank_accuracies = map(
                    InfluenceAnalysis._get_rank_accuracy,
                    zip(models_location_rank, [models_location_rank] * len(models_location_rank)))
                mf_model_id_to_misrank_accuracies[model_id].append(np.mean(misrank_accuracies))
                # Random baseline: shuffle the same ranks and re-score.
#                random_location_rank = range(len(locations))
                random_location_rank = models_location_rank
                random.shuffle(random_location_rank)
                random_misrank_accuracies = map(
                    InfluenceAnalysis._get_rank_accuracy,
                    zip(random_location_rank, [random_location_rank] * len(random_location_rank)))
                data = ', '.join([str(hashtag_count),
                                  str(len(ltuo_location_and_occurrence_time)),
                                  str(np.mean(misrank_accuracies)),
                                  str(np.mean(random_misrank_accuracies)),
                                  str(len(models_location_rank))])
                FileIO.writeToFile(data, output_file)
def writeLocationClusters(place):
    GeneralMethods.runCommand('rm -rf %s' % placesLocationClustersFile % place['name'])
    clusterId = place.get('k')
    locations = getLocationWithClusterDetails(place, clusterId)
    locationVectorsToCluster = [(location, dict((clusterId, len(epochs)) for clusterId, epochs in checkins['checkins'].iteritems()))
                                for location, checkins in locations.values()[0].iteritems()]
    resultsForVaryingK = []
    for k in range(60, 80):
        try:
            print 'Clustering with k=%s' % k
            clusters = KMeansClustering(locationVectorsToCluster, k, documentsAsDict=True).cluster(normalise=True, assignAndReturnDetails=True, repeats=5, algorithmSource='biopython')
            error = clusters['error']
            for clusterId, features in clusters['bestFeatures'].items()[:]:
                clusters['bestFeatures'][str(clusterId)] = [(lid.replace('_', ' '), score) for lid, score in features]
                del clusters['bestFeatures'][clusterId]
            for clusterId, users in clusters['clusters'].items()[:]:
                clusters['clusters'][str(clusterId)] = users
                del clusters['clusters'][clusterId]
            if error:
                resultsForVaryingK.append((k, error, clusters, dict((clusterId, GeneralMethods.getRandomColor()) for clusterId in clusters['clusters'])))
            else:
                resultsForVaryingK.append((k, meanClusteringDistance(clusters['bestFeatures'].itervalues()), clusters, dict((clusterId, GeneralMethods.getRandomColor()) for clusterId in clusters['clusters'])))
        except Exception as e:
            print '*********** Exception while clustering k = %s; %s' % (k, e)
    # Write the best clustering (lowest error) first, then all results for inspection.
    FileIO.writeToFileAsJson(min(resultsForVaryingK, key=itemgetter(1)), placesLocationClustersFile % place['name'])
    for data in resultsForVaryingK:
        FileIO.writeToFileAsJson(data, placesLocationClustersFile % place['name'])
def performanceAsSpammerPayloadVaries(generateData):
    experimentData = defaultdict(dict)
    for iteration in range(10):
        for spammerPayload in range(1, 11):
            experimentFileName = spamModelFolder + 'performanceAsSpammerPayloadVaries/%s/%0.3f' % (iteration, spammerPayload)
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model, 'numberOfTimeSteps': 10,
                        'addUsersMethod': User.addUsersUsingRatio,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 0.985, 'spammer': 0.015},
                        'noOfPayloadsPerSpammer': spammerPayload,
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesDuplicatesRemoved, RankingModel.popularMessages],
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                run(**conf)
            else:
                tempData = defaultdict(list)
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        tempData[ranking_id] += data['spammmess'][ranking_id]
                experimentData[iteration][spammerPayload] = tempData
    if not generateData:
        realDataY = defaultdict(dict)
        for iteration in experimentData:
            dataY = defaultdict(list)
            dataX = []
            for perct in sorted(experimentData[iteration]):
                dataX.append(perct)
                for ranking_id, values in experimentData[iteration][perct].iteritems():
                    dataY[ranking_id].append(np.mean(values))
            dataX = sorted(dataX)
            for ranking_id in dataY:
                for x, y in zip(dataX, dataY[ranking_id]):
                    if x not in realDataY[ranking_id]: realDataY[ranking_id][x] = []
                    realDataY[ranking_id][x].append(y)
        for ranking_id in dataY:
            if ranking_id in labels:
                plt.plot(dataX, [np.mean(realDataY[ranking_id][x]) for x in dataX], label=labels[ranking_id], lw=1, marker=RankingModel.marker[ranking_id])
        plt.xlabel('No. of Spam Payload', fontsize=16, fontweight='bold')
        plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#        plt.title('Spammness with changing spammer payloads')
        plt.legend(prop=prop, loc='upper center', bbox_to_anchor=(0.5, 1.12), ncol=3, fancybox=True, shadow=False)
#        plt.show()
        plt.savefig('performanceAsSpammerPayloadVaries.png')
        plt.clf()
def testClassifierPerformance(numberOfTimeUnits=24):
    validLattices = set()
    for data in FileIO.iterateJsonFromFile(hashtagsLatticeGraphFile % ('world', '%s_%s' % (2, 11))):
        validLattices.add(data['id'])
    documents, lattices = [], set()
    for h in FileIO.iterateJsonFromFile(hashtagsFile % ('training_world', '%s_%s' % (2, 11))):
        hashtag, document = Hashtag(h), []
        if hashtag.isValidObject():
            for timeUnit, occs in enumerate(hashtag.getOccrancesEveryTimeWindowIterator(HashtagsClassifier.CLASSIFIER_TIME_UNIT_IN_SECONDS)):
                occs = filter(lambda t: t[0] in validLattices, occs)
                occs = sorted(occs, key=itemgetter(0))
                if occs:
                    for lattice in zip(*occs)[0]: lattices.add(lattice)
                    document.append([timeUnit, [(k, len(list(i))) for k, i in groupby(occs, key=itemgetter(0))]])
            if document: documents.append(document)
    lattices = sorted(list(lattices))
    print len(lattices)
    documents = [(d, TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(d)) for d in documents]
    # Hold out the last 20% of documents as the test set.
    documents = documents[-int(len(documents) * 0.20):]
    GeneralMethods.runCommand('rm -rf %s' % TargetSelectionRegressionClassifier.classifiersPerformanceFile)
    for decisionTimeUnit in range(1, numberOfTimeUnits + 1):
        for classifierType in [TargetSelectionRegressionSVMRBFClassifier, TargetSelectionRegressionSVMLinearClassifier,
                               TargetSelectionRegressionSVMPolyClassifier, TargetSelectionRegressionClassifier]:
            totalError = []
            for latticeCount, predictingLattice in enumerate(lattices):
                inputVectors, outputValues, tempError = [], [], []
                for rawDocument, processedDocument in documents:
                    documentForTimeUnit = TargetSelectionRegressionClassifier.getPercentageDistributionInLattice(rawDocument[:decisionTimeUnit])
                    if documentForTimeUnit and processedDocument:
                        vector = [documentForTimeUnit.get(l, 0) for l in lattices]
                        inputVectors.append(vector)
                        outputValues.append(float(processedDocument.get(predictingLattice, 0)))
                classifier = classifierType(decisionTimeUnit=decisionTimeUnit, predictingLattice=predictingLattice)
                for iv, ov in zip(inputVectors, outputValues):
                    if latticeCount == 2: print ov, classifier.predict(iv), pow(ov - classifier.predict(iv), 2)
                    if ov != 0.0: tempError.append(pow(ov - classifier.predict(iv), 2))
#                print tempError, np.mean(tempError)
#                exit()
                totalError.append(np.mean(tempError))
            print {'id': classifier.id, 'timeUnit': decisionTimeUnit - 1, 'error': np.mean(totalError)}
            FileIO.writeToFileAsJson({'id': classifier.id, 'timeUnit': decisionTimeUnit - 1, 'error': np.mean(totalError)},
                                     TargetSelectionRegressionClassifier.classifiersPerformanceFile)
def plot_locations_influence_on_world_map(ltuo_model_id_and_hashtag_tag, noOfInfluencers=10, percentage_of_locations=0.15):
    input_locations = [
        ('40.6000_-73.9500', 'new_york'),
        ('33.3500_-118.1750', 'los_angeles'),
        ('29.7250_-97.1500', 'austin'),
        ('30.4500_-95.7000', 'college_station'),
        ('-22.4750_-42.7750', 'rio'),
        ('51.4750_0.0000', 'london'),
        ('-23.2000_-46.4000', 'sao_paulo')
    ]
    for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
        tuo_location_and_tuo_neighbor_location_and_locations_influence_score =\
            Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers=None, influence_type=InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE)
        for input_location, label in input_locations:
            for location, tuo_neighbor_location_and_locations_influence_score in\
                    tuo_location_and_tuo_neighbor_location_and_locations_influence_score:
                if input_location == location:
                    input_location = getLocationFromLid(input_location.replace('_', ' '))
                    output_file = fld_results % GeneralMethods.get_method_id() + '/%s_%s/%s.png' % (model_id, hashtag_tag, label)
                    number_of_outgoing_influences = int(len(tuo_neighbor_location_and_locations_influence_score) * percentage_of_locations)
                    if number_of_outgoing_influences == 0:
                        number_of_outgoing_influences = len(tuo_neighbor_location_and_locations_influence_score)
                    locations = zip(*tuo_neighbor_location_and_locations_influence_score)[0][:number_of_outgoing_influences]
                    locations = [getLocationFromLid(location.replace('_', ' ')) for location in locations]
#                    locations = filter(lambda location: isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY), locations)
                    if locations:
                        _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False, bkcolor='#000000', c='#FF00FF', returnBaseMapObject=True, lw=0)
#                        _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#FF00FF', returnBaseMapObject=True, lw=0)
                        for location in locations:
#                            if isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY):
                            m.drawgreatcircle(location[1], location[0], input_location[1], input_location[0], color='#FAA31B', lw=1., alpha=0.5)
#                        plotPointsOnWorldMap([input_location], blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw=0)
                        plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False, bkcolor='#000000', c='#003CFF', s=40, lw=0)
#                        plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw=0)
                        FileIO.createDirectoryForFile(output_file)
                        print output_file
                        savefig(output_file)
                        plt.clf()
                    else:
                        GeneralMethods.runCommand('rm -rf %s' % output_file)
                    break
def analyzeRunningTime(graphs, graphType, numberOfPoints=50):
    edgesToKeep = 0.35
    def getRunningTime(graphs, linear):
        graphMap = dict(graphs)
        startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
        timeDifference = endingGraphId - startingGraphId
        LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
        dataToReturn = []
        for j, intervalInSeconds in enumerate(range(0, timeDifference, int(timeDifference / numberOfPoints))):
            ts = time.time()
            graph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId + 1), intervalInSeconds, linear=linear, edgesToKeep=edgesToKeep)
            noOfClusters, clusters = clusterUsingAffinityPropagation(graph)
            clusters = [[str(c), [l[0] for l in lst]] for c, lst in groupby(sorted(clusters, key=itemgetter(1)), key=itemgetter(1))]
            te = time.time()
            edgeWeights = sum(data['w'] for _, _, data in graph.edges(data=True))
            print graphType, linear, len(clusters), graph.number_of_nodes(), graph.number_of_edges(), edgeWeights, j, te - ts
            dataToReturn.append({'intervalInSeconds': intervalInSeconds, 'runningTime': te - ts, 'clusters': clusters, 'noOfNodes': graph.number_of_nodes()})
        return dataToReturn
    graphFile = runningTimesFolder % graphType
    print graphFile
    GeneralMethods.runCommand('rm -rf %s' % graphFile)
    for linear in [False, True]:
        FileIO.writeToFileAsJson({'linear': linear, 'analysis': getRunningTime(graphs, linear)}, graphFile)
def plot_maps_for_every_hour():
    MINUTES = 15
    hashtags = ['ripstevejobs', 'cnbcdebate']
    map_from_hashtag_to_subplot = dict([('ripstevejobs', 211), ('cnbcdebate', 212)])
    map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag = defaultdict(dict)
    for hashtag in hashtags:
        for hashtag_object in FileIO.iterateJsonFromFile('./data/%s.json' % hashtag):
            map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time = getOccurranceDistributionInEpochs(getOccuranesInHighestActiveRegion(hashtag_object), timeUnit=MINUTES * 60, fillInGaps=True, occurancesCount=False)
            tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time = sorted(map_from_epoch_time_unit_to_tuples_of_location_and_epoch_occurrence_time.iteritems(), key=itemgetter(0))
            epoch_starting_time_unit = tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time[0][0]
            epoch_ending_time_unit = epoch_starting_time_unit + 24 * 60 * 60
            for epoch_time_unit, tuples_of_location_and_epoch_occurrence_time in tuples_of_epoch_time_unit_and_tuples_of_location_and_epoch_occurrence_time:
                if epoch_time_unit <= epoch_ending_time_unit:
                    if tuples_of_location_and_epoch_occurrence_time:
                        epoch_lag = epoch_time_unit - epoch_starting_time_unit
                        tuples_of_location_and_epoch_occurrence_time = sorted(tuples_of_location_and_epoch_occurrence_time, key=itemgetter(1))
                        map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag][hashtag] =\
                            [(getLatticeLid(location, 0.145), epoch_occurrence_time - epoch_starting_time_unit)
                             for location, epoch_occurrence_time in tuples_of_location_and_epoch_occurrence_time]
    map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag = defaultdict(list)
    GeneralMethods.runCommand('rm -rf ./images/plot_maps_for_every_hour/')
    for epoch_lag in sorted(map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag):
        file_world_map_plot = './images/plot_maps_for_every_hour/%s.png' % (epoch_lag)
        print file_world_map_plot
        map_from_hashtag_to_tuples_of_location_and_epoch_lag = map_from_epoch_lag_to_map_from_hashtag_to_tuples_of_location_and_epoch_lag[epoch_lag]
        for hashtag, tuples_of_location_and_epoch_lag in map_from_hashtag_to_tuples_of_location_and_epoch_lag.iteritems():
            map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag[hashtag] += tuples_of_location_and_epoch_lag
        for hashtag, accumulated_tuples_of_location_and_epoch_lag in map_from_hashtag_to_accumulated_tuples_of_location_and_epoch_lag.iteritems():
            plt.subplot(map_from_hashtag_to_subplot[hashtag])
            tuples_of_location_and_epoch_max_lag = [(location, max(zip(*iterator_of_tuples_of_location_and_epoch_lag)[1]))
                                                    for location, iterator_of_tuples_of_location_and_epoch_lag in
                                                    groupby(sorted(accumulated_tuples_of_location_and_epoch_lag, key=itemgetter(0)), key=itemgetter(0))]
            locations, colors = zip(*[(getLocationFromLid(location.replace('_', ' ')), (epoch_lag + MINUTES * 60) - epoch_max_lag)
                                      for location, epoch_max_lag in sorted(tuples_of_location_and_epoch_max_lag, key=itemgetter(1), reverse=True)])
            plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors, cmap=matplotlib.cm.cool, lw=0, vmax=epoch_lag + MINUTES * 60)
            plt.title('%s (%s hours)' % (hashtag, (epoch_lag + MINUTES * 60) / (60. * 60)))
#        plt.show()
        FileIO.createDirectoryForFile(file_world_map_plot)
        plt.savefig(file_world_map_plot)
        plt.clf()
def locations_at_top_and_bottom(model_ids, no_of_locations=5):
    for model_id in model_ids:
        output_file_format = analysis_folder + '%s/' % (GeneralMethods.get_method_id()) + '%s/%s.json'
        input_locations = [
#            ('40.6000_-73.9500', 'new_york'),
            ('30.4500_-95.7000', 'college_station'),
        ]
        tuo_location_and_tuo_neighbor_location_and_influence_score =\
            Experiments.load_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(model_id)
        for input_location, label in input_locations:
            for location, tuo_neighbor_location_and_influence_score in\
                    tuo_location_and_tuo_neighbor_location_and_influence_score:
                if input_location == location:
                    output_file = output_file_format % (input_location, model_id)
                    GeneralMethods.runCommand('rm -rf %s' % output_file)
                    FileIO.createDirectoryForFile(output_file)
                    # Scores are sorted ascending, so the head is the bottom and the tail the top.
                    FileIO.writeToFileAsJson("Bottom:", output_file)
                    for neighbor_location_and_influence_score in tuo_neighbor_location_and_influence_score[:no_of_locations]:
                        FileIO.writeToFileAsJson(neighbor_location_and_influence_score + [''], output_file)
                    FileIO.writeToFileAsJson("Top:", output_file)
                    for neighbor_location_and_influence_score in\
                            reversed(tuo_neighbor_location_and_influence_score[-no_of_locations:]):
                        FileIO.writeToFileAsJson(neighbor_location_and_influence_score + [''], output_file)
def copy_file(input_file, output_file):
    command = 'cp %s %s' % (input_file, output_file)
    GeneralMethods.runCommand(command)
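# GeneralMethods.runCommand is project code; a minimal stand-in, assuming it simply
# echoes the command and shells out synchronously, might look like this:
def _run_command_sketch(command):
    import os
    print command
    os.system(command)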
def getMahoutOutput(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation):
    outputFile = locationsFIMahoutOutputFile % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, minSupport)
    GeneralMethods.runCommand('mahout seqdumper -s fi/output/frequentpatterns/part-r-00000 > %s' % outputFile)
def performanceWithSpamDetectionVaryingPercentageOfSpammers(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0, 0.4, 0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
#    spammerPercentages = [0.2, 0.01, 0.01]
#    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        l1 = [spammerPercentage * 0.001 for spammerPercentage in range(1, 51)]
        l2 = [spammerPercentage * 0.05 for spammerPercentage in range(1, 21)]
        l3 = [0.01] + l2
        spammer_percentages = l3
        for spammerPercentage in spammer_percentages:
            for spamDetectionRatio, spammerPercentage in zip(ratios, [spammerPercentage] * 3):
                experimentFileName = spamModelFolder + 'performanceWithSpamDetectionVaryingPercentageOfSpammers/%s/%0.3f/%0.3f' % (iteration, spammerPercentage, spamDetectionRatio)
                print experimentFileName
                if generateData:
                    model = MixedUsersModel()
                    conf = {'model': model, 'numberOfTimeSteps': 10,
                            'addUsersMethod': User.addUsersUsingRatioWithSpamDetection,
                            'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                            'ratio': {'normal': 1 - spammerPercentage, 'spammer': spammerPercentage},
#                            'spammerMessagingProbability': spammerBudget,
                            'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered,
                                               RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                            'spamDetectionRatio': spamDetectionRatio,
                            'experimentFileName': experimentFileName}
                    GeneralMethods.runCommand('rm -rf %s' % experimentFileName)
                    run(**conf)
                else:
#                    for data in FileIO.iterateJsonFromFile(experimentFileName):
#                        for ranking_id in data['spammmess']:
#                            if data['currentTimeStep'] not in experimentData[spamDetectionRatio]: experimentData[spamDetectionRatio][data['currentTimeStep']] = defaultdict(list)
#                            experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id] += data['spammmess'][ranking_id]
                    tempData = defaultdict(list)
                    for data in FileIO.iterateJsonFromFile(experimentFileName):
                        for ranking_id in data['spammmess']:
                            tempData[ranking_id] += data['spammmess'][ranking_id]
                    if spammerPercentage not in experimentData[spamDetectionRatio]:
                        experimentData[spamDetectionRatio][spammerPercentage] = defaultdict(list)
                    for ranking_id in tempData:
                        experimentData[spamDetectionRatio][spammerPercentage][ranking_id] += tempData[ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
#            for spammerPercentage in sorted(experimentData[spamDetectionRatio]):
            for spammerPercentage in spammer_percentages:
                dataToPlot['x'].append(spammerPercentage)
                for ranking_id in experimentData[spamDetectionRatio][spammerPercentage]:
                    dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][spammerPercentage][ranking_id]))
            sdr[spamDetectionRatio] = dataToPlot
#        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
        for ranking_id in [RankingModel.LATEST_MESSAGES, RankingModel.POPULAR_MESSAGES]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
#                dataY = smooth(sdr[spamDetectionRatio][ranking_id], 8)[:len(sdr[spamDetectionRatio]['x'])]
                dataY = sdr[spamDetectionRatio][ranking_id][:len(sdr[spamDetectionRatio]['x'])]
                dataX, dataY = sdr[spamDetectionRatio]['x'], dataY
#                dataX, dataY = splineSmooth(dataX, dataY)
                if spamDetectionRatio == 0.0:
                    plt.plot(dataX, dataY, label='%s' % (labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
                else:
                    plt.plot(dataX, dataY,
                             label='%s after spam detection (%d' % (labels[ranking_id].replace('Filtering', 'Detection'), spamDetectionRatio * 100) + '%)',
                             lw=1, marker=marker[spamDetectionRatio])
#            plt.xlim(xmax=0.05)
#            plt.ylim(ymax=0.8)
            plt.legend(loc=4)
            plt.xlabel('Percentage of Spammers', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#            plt.savefig('performanceWithSpamDetectionVaryingPercentageOfSpammers_%s.png'%ranking_id)
            savefig('/Users/krishnakamath/Dropbox/temp/performanceWithSpamDetectionVaryingPercentageOfSpammers_%s.png' % ranking_id)
            plt.clf()
def calculateFrequentLocationItemsets(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation):
    inputFile = locationsFIMahoutInputFile % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation)
    GeneralMethods.runCommand('rm -rf %s.*' % inputFile)
    GeneralMethods.runCommand('hadoop fs -rmr fi/*')
    # Package the input file and push it to HDFS before launching Mahout's parallel FP-Growth.
    GeneralMethods.runCommand('tar -cvf %s.tar %s' % (inputFile, inputFile))
    GeneralMethods.runCommand('gzip %s.tar' % (inputFile))
    GeneralMethods.runCommand('hadoop fs -put %s.tar.gz fi/.' % inputFile)
    GeneralMethods.runCommand(
        'mahout fpg -i fi/mh_input_%s_%s.tar.gz -o fi/output -k 50 -g 100000 -method mapreduce -s %s'
        % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, minSupport))
def writeHDFSFileForValidLocationAndUser(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation):
    for locationVector in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation):
        for user in locationVector['users'].keys()[:]:
            locationVector['users'][str(user)] = locationVector['users'][user]
            del locationVector['users'][user]
        FileIO.writeToFileAsJson(locationVector, validLocationAndUserFile)
    GeneralMethods.runCommand('hadoop fs -put %s %s' % (validLocationAndUserFile, validLocationAndUserHdfsPath))
def runMRJob(mrJobClass, outputFileName, inputFile=checkinsHdfsPath, args='-r hadoop'.split(), **kwargs):
    mrJob = mrJobClass(args=args)  # honor the args parameter instead of hard-coding '-r hadoop'
    GeneralMethods.runCommand('rm -rf %s' % outputFileName)
    for l in mrJob.runJob(inputFileList=[inputFile], **kwargs):
        FileIO.writeToFileAsJson(l[1], outputFileName)
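# Hypothetical usage sketch of runMRJob: MirrorJob, the module name, and the paths
# below are invented for illustration; any job class in this codebase exposing the
# runJob(inputFileList=...) wrapper would be invoked the same way.
# from my_mr_jobs import MirrorJob
# runMRJob(MirrorJob, '/tmp/mirror_output', inputFile='hdfs:///data/checkins')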
def writeTopFeaturesForCluster():
    # Relies on module-level `place` and `noOfFeatures`.
    clustersFileName = '%s/topFeaturesForCluster' % placesAnalysisFolder % place['name']
    GeneralMethods.runCommand('rm -rf %s' % clustersFileName)
    for data in FileIO.iterateJsonFromFile(placesUserClusterFeaturesFile % place['name']):
        clusterId, color, features = data
        FileIO.writeToFileAsJson([clusterId, [f[2] for f in features[:noOfFeatures]]], clustersFileName)
def writeARFFFile(place):
    userVectors = GenerateDataFiles.getUserVectors(place)
    arffFile = ARFF.writeARFFForClustering(userVectors, place['name'])
    outputFileName = placesARFFFile % place['name']
    FileIO.createDirectoryForFile(outputFileName)
    GeneralMethods.runCommand('mv %s %s' % (arffFile, outputFileName))