Example No. 1
    def generate_data_for_significant_nei_utm_ids():
        output_file = GeneralMethods.get_method_id()+'.json'
        so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags = sorted(list(so_hashtags))
        mf_utm_id_to_vector = {}
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#                print i, utm_object['utm_id']
            utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                 hashtags)
            mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
        for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
            print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
            ltuo_utm_id_and_vector = [(utm_id, vector)]
            for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
                if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                    ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
            dfm_dict = cjson.decode(df_utm_vectors_json)
            mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
            utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
            dfm_dict['prediction_variable'] = utm_id_colname
            dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                     df_utm_vectors.colnames)
            dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
            FileIO.writeToFileAsJson(dfm_dict, output_file)
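FileIO.writeToFileAsJson and FileIO.iterateJsonFromFile are project helpers whose source is not shown in these examples. From the way they are called here (one object written per call, and the same objects iterated back later), they appear to follow a JSON-lines pattern. A minimal sketch of that assumed behavior, with hypothetical names:

import json

def write_to_file_as_json(data, output_file):
    # Assumed behavior: append one JSON-serialized object per line.
    with open(output_file, 'a') as f:
        f.write(json.dumps(data) + '\n')

def iterate_json_from_file(input_file):
    # Assumed behavior: yield one decoded object per non-empty line.
    with open(input_file) as f:
        for line in f:
            if line.strip():
                yield json.loads(line)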
Example No. 2
 def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
     for test_model_id in test_models_ids:
         output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
         GeneralMethods.runCommand('rm -rf %s'%output_file)
         ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
         for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                 enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
             ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                     for location, ito_location_and_occurrence_time in
                                                         groupby(
                                                                 sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                                 key=itemgetter(0)
                                                         )
                                                 ] 
             print hashtag_count, test_model_id
             ltuo_location_and_pure_influence_score = []
             for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                 pure_influence_scores = []
                 for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                     if location!=neighbor_location:
                         pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                         pure_influence_scores.append(pure_influence_score)
                 ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
             ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
             FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
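itertools.groupby only groups adjacent items, which is why the snippet above sorts by location before grouping. A stripped-down version of that collect-occurrence-times-per-location step, using made-up sample data:

from itertools import groupby
from operator import itemgetter

# Hypothetical (location, occurrence_time) pairs.
ltuo_location_and_occurrence_time = [('loc_a', 5), ('loc_b', 2), ('loc_a', 1), ('loc_b', 9)]
ltuo_location_and_occurrence_times = [
    (location, sorted(t for _, t in pairs))
    for location, pairs in groupby(
        sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
        key=itemgetter(0))
]
# -> [('loc_a', [1, 5]), ('loc_b', [2, 9])]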
Example No. 3
    def trendCurves(iterationData=None, experimentFileName=None):
        if iterationData: 
            currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
            experimentFileName = conf['experimentFileName']
            if not finalCall:
                topicDistribution = dict((str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]}) for topic in currentTopics)
#                print currentTimeStep
                FileIO.writeToFileAsJson({'t':currentTimeStep, 'topics':topicDistribution}, experimentFileName)
            else:
                iterationInfo  = {'trending_topics': [topic.id for topic in currentTopics if topic.stickiness>=stickinessLowerThreshold],
                      'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                      'conf': conf}
                del iterationInfo['conf']['spamDectectionMethod']
                FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
        else:
            topicsDataX = defaultdict(list)
            topicsDataY = defaultdict(list)
            for data in FileIO.iterateJsonFromFile(experimentFileName):
                if 'conf' not in data:
                    for topic in data['topics']: topicsDataX[topic].append(data['t']), topicsDataY[topic].append(data['topics'][topic]['timeStep'])
                else: topicColorMap=data['topic_colors']; trendingTopics=data['trending_topics']
            for topic in topicsDataX: plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
            plt.figure()
            for topic in trendingTopics: plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
            plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
            plt.show()
Example No. 4
    def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
        for model_id in models_ids:
#            if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#            else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
            output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
            GeneralMethods.runCommand('rm -rf %s'%output_file)
            for line_count, location_object in enumerate(iterateJsonFromFile(
                     location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                     )):
                print line_count, model_id
                tuo_neighbor_location_and_pure_influence_score = []
                location_hashtag_set = set(location_object['hashtags'])
                for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                    pure_influence_scores = []
                    for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                        if hashtag in location_object['hashtags']:
                            location_occurrences = location_object['hashtags'][hashtag][0]
                            pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                    neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                    if hashtag_tag==w_extra_hashtags_tag:
                        for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                        for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                    mean_pure_influence_score = np.mean(pure_influence_scores)
                    tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
                tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
                FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
Example No. 5
def generateRadiusSpots(radiusInMiles):
    graph = nx.Graph()
    spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles)
    print 'Creating:', spotsFile
    for lid in locationIterator():
        for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid)
    for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)
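nx.connected_components merges every pair of venues joined by a within-radius edge into a single component, so each component becomes one spot. A self-contained sketch with toy venue ids (locationIterator and nearbyLocations are project-specific helpers that are not shown):

import networkx as nx

graph = nx.Graph()
# Hypothetical edges: each edge marks two venues that lie within the radius.
graph.add_edge('venue_1', 'venue_2')
graph.add_edge('venue_2', 'venue_3')
graph.add_edge('venue_4', 'venue_5')
spots = [sorted(component) for component in nx.connected_components(graph)]
# -> [['venue_1', 'venue_2', 'venue_3'], ['venue_4', 'venue_5']]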
Example No. 6
 def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
     def location_similarity(location_vector_1, location_vector_2): 
         return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.)
     influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
     for model_id in model_ids:
         mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
         GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
         for line_count, location_object in enumerate(iterateJsonFromFile(
                      location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                  )):
             print line_count
             location = location_object['id']
             tuo_neighbor_location_and_mf_influence_type_and_similarity = []
             for neighbor_location in location_object['links'].keys(): 
                 mf_influence_type_and_similarity = {}
                 for influence_type in influence_types:
                     similarity = location_similarity( 
                                                          mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                                          mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                                                   )
                     mf_influence_type_and_similarity[influence_type] = similarity
                 so_hashtags_for_location = set(location_object['hashtags'].keys())
                 so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                 numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                 denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                 mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator                
                 tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
             FileIO.writeToFileAsJson(
                                      [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                                      tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
                                      )
 def generateStatsForMRKMeansClusteringQuality():
     for i in [90000, 100000, 200000, 300000, 400000, 500000]: 
         print 'Generating stats for: ',i
         tf = TweetsFile(i, **experts_twitter_stream_settings)
         FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(), 
                                   'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                   TweetsFile.mr_stats_file)
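The mf_influence_type_and_similarity[JACCARD_SIMILARITY] computation above is a plain Jaccard index over the two hashtag sets (intersection size over union size). A minimal standalone version:

def jaccard_similarity(hashtags_for_location, hashtags_for_neighbor_location):
    # |A intersection B| / |A union B|, as computed in the example above.
    numerator = len(hashtags_for_location & hashtags_for_neighbor_location) + 0.
    denominator = len(hashtags_for_location | hashtags_for_neighbor_location) + 0.
    return numerator / denominator if denominator else 0.0

# jaccard_similarity(set(['a', 'b']), set(['b', 'c'])) -> 0.333...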
Example No. 8
    def measureRankingQuality(iterationData=None, experimentFileName=None):
        #        def getTopTopics(model, noOfTopics):
        #            topics = set()
        #            topTopics = model.topTopics[:]
        #            while True:
        #                topicIndex = GeneralMethods.weightedChoice([i[1] for i in topTopics])
        #                topic = topTopics[topicIndex][0].id
        #                del topTopics[topicIndex]
        #                if topic not in topics: topics.add(topic)
        #                if len(topics)==noOfTopics or len(topics)==len(model.topTopics): break
        #            return [(t, 0) for t in topics]

        if iterationData:
            currentTimeStep, model, _, _, finalCall, conf = iterationData
            if not finalCall:
                rankingMethods = conf["rankingMethods"]
                experimentFileName = conf["experimentFileName"]
                topTopics = sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[
                    :10
                ]
                #                topTopics = getTopTopics(model, 10)
                #                topTopics = random.sample(sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)[:10], min(len(model.topicsDistributionInTheTimeSet),5))
                #                topTopics = random.sample(model.topicsDistributionInTheTimeSet.items(), min(len(model.topicsDistributionInTheTimeSet),5))
                iterationData = {"currentTimeStep": currentTimeStep, "spammmess": defaultdict(list)}
                for rankingMethod in rankingMethods:
                    for queryTopic, _ in topTopics:
                        ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
                        #                        if spammness(messages, norm_k)==0:
                        #                            print 'c'
                        #                        print rankingMethod, spammness(messages, norm_k)
                        iterationData["spammmess"][ranking_id].append(spammness(messages, norm_k))
                #                        print ranking_id, spammness(messages, norm_k)
                FileIO.writeToFileAsJson(iterationData, experimentFileName)
                model.topicsDistributionInTheTimeSet = defaultdict(int)
 def generate(self):
     i=0
     for tweet in TwitterIterators.iterateTweetsFromExperts(): 
         FileIO.writeToFileAsJson(tweet, self.fileName)
         i+=1
         if i==self.length: break;
     os.system('gzip %s'%self.fileName)
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             "type": experts_twitter_stream_settings["dimensions_performance_type"],
             "dimensions": experts_twitter_stream_settings["dimensions"],
         },
     )
     iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap)
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file)
     del iteration_data["clusters"]
     print currentMessageTime, iteration_data
     if experts_twitter_stream_settings["dimensions"] != 76819 and 2 * experts_twitter_stream_settings[
         "dimensions"
     ] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap):
         raise Exception
Example No. 11
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
#    output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

    print PARAMS_DICT
#    runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     This function is used to estimate the number of dimensions in the stream. To estimate it, we calculate
     the number of phrases that need to be added every iteration for different dimensions.
     The dimension at which the number of phrases added stabilizes is the number of dimensions
     for the stream.

     Why do we need this?
     The aim is to get dimensions that don't change too often and, at the same time, are not very large.
     This experiment gives us an approximate idea of the number of dimensions. Randomly picking
     a small value will result in dimensions that are not good, and picking too big a value will
     result in inefficiency.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimensions if the dimensions
     are being updated at regular intervals.
     For example, the number of dimensions being added after 10m, 20m, ... 5 hours.
     As time increases, the number of 'decayed' dimensions increases. The current dimensions
     have a lot of unwanted decayed dimensions. Using this information, identify the time
     interval that is best suited to refresh dimensions.
     Tentative: we decide to pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
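Both estimation routines above reduce to the same measurement: how many items enter the current top-k list that were absent from the previous one, computed with a set difference. A small sketch of that core step:

def count_new_entries(old_list, new_list, boundary):
    # Number of items in the current top-`boundary` that were missing
    # from the previous top-`boundary` (the stabilization signal used above).
    return len(set(new_list[:boundary]).difference(old_list[:boundary]))

# count_new_entries(['a', 'b', 'c'], ['a', 'c', 'd'], 3) -> 1  ('d' is new)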
Example No. 14
    def analyzeQuality(graphs, graphType):
        def getQualityScore(graphMap, edgesToKeep, timeDifference):
            dataToReturn = []
            for j, intervalInSeconds in enumerate([1]):
                intervalInSeconds*=timeDifference
                linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
                logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
                linearClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                logarithmicClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
                print intervalInSeconds, edgesToKeep, score
                dataToReturn.append(score)
            return dataToReturn
        graphFile = qualityMetricsFolder%graphType
        print graphFile
        GeneralMethods.runCommand('rm -rf %s'%graphFile)
        for edgesToKeep in range(1,11): 
#        for edgesToKeep in [1,10]: 
            edgesToKeep*=0.1
            graphMap = dict(graphs[:])
            startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
            timeDifference = endingGraphId-startingGraphId
            LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#            print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
            FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
Example No. 15
 def run_job_on_hashtags_in_dfs(mr_class, output_file):
     job_conf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     print 'Hadoop job conf:'
     pprint(job_conf)
     runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
 def writeTweetsForDay(currentDay):
     fileName = houston_data_folder+FileIO.getFileByDay(currentDay)
     for tweet in tweets.find({'ca': {'$gt':currentDay, '$lt': currentDay+timedelta(seconds=86399)}}, fields=['ca', 'tx', 'uid']):
         screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
         if screenName!=None: 
             data = {'id': tweet['_id'], 'text': tweet['tx'], 'created_at':getStringRepresentationForTweetTimestamp(tweet['ca']), 'user':{'screen_name': GenerateHoustonTweetsData.getScreenName(tweet['uid'])}}
             FileIO.writeToFileAsJson(data, fileName) 
     os.system('gzip %s'%fileName)
 def generateStatsForDefaultStreamSettings():
     for i in [10**3, 10**4, 10**5]: 
         for j in range(1, 10):
             print 'Generating stats for: ',i*j
             tf = TweetsFile(i*j, **default_experts_twitter_stream_settings)
             FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 
                                       'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                       TweetsFile.default_stats_file)
 def generateStatsForQualityComparisonWithSSA():
     #        for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]:
     for length in [1000000]:
         print "Generating stats for: ", length
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         #            stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
         stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)}
         FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
Example No. 20
def writeCheckinSequenceGraphFile():   
    userSet = set([userVector['user'] for userVector in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord = True)])
    count, total = 1, len(userSet)
    for user in userSet:
        print user, count, total
        checkins = [(c['_id'], c['lid'], time.mktime(c['t'].timetuple())) for c in checkinsCollection.find({'u': user})]
        for i in GeneralMethods.getElementsInWindow(checkins, 2): FileIO.writeToFileAsJson([user, i], checkinSequenceGraphFile)
        count+=1
Example No. 21
def generateLocationClusterData():
#    p = Pool()
#    totalLocations = len(list(locationClusterIterator()))
#    i=1
    for location in locationClusterIterator():
        location = clusterLocation(location)
#        print '%s of %s'%(i,totalLocations)
        FileIO.writeToFileAsJson(location, locationClustersFile)
Example No. 22
 def writeUserClustersFile(place):
     print 'Generating clusters...'
     userVectors = GenerateDataFiles.getUserVectors(place)
     GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
     clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
 #    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
     for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
     for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
Example No. 23
    def run():
        for graphType, method in [\
#                                  (RandomGraphGenerator.fast_gnp_random_graph, RandomGraphGenerator.fastGnp),
#                                  (RandomGraphGenerator.erdos_renyi_graph, RandomGraphGenerator.erdosRenyi),
#                                  (RandomGraphGenerator.newman_watts_strogatz_graph, RandomGraphGenerator.nWS),
                                (RandomGraphGenerator.powerlaw_cluster_graph, RandomGraphGenerator.powerlawClusterGraph),
                                  ]:
            for i in range(1, 11): FileIO.writeToFileAsJson({'n': 100*i, 'graphs': method(1000*i)}, randomGraphsFolder%graphType)
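RandomGraphGenerator wraps the standard networkx generators named in the tuples. A sketch of calling one of them directly; the m and p values below are illustrative and not taken from the project:

import networkx as nx

# powerlaw_cluster_graph(n, m, p): n nodes, m edges added per new node,
# p probability of adding a triangle after each random edge.
graph = nx.powerlaw_cluster_graph(n=1000, m=3, p=0.1)
# graph.number_of_nodes() -> 1000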
Example No. 24
 def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
     PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
     PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     runMRJob(mr_class,
              output_file,
              MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time),
              jobconf={'mapred.reduce.tasks':500})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example No. 25
 def tweet_stats(input_files_start_time, input_files_end_time):
     mr_class = TweetStats
     output_file = f_tweet_stats
     runMRJob(mr_class,
              output_file,
              getInputFiles(input_files_start_time, input_files_end_time),
              mrJobClassParams = {'job_id': 'as'},
              jobconf={'mapred.reduce.tasks':300})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [cluster.documentsInCluster.keys() for k, cluster in hdStreamClusteringObject.clusters.iteritems() if len(cluster.documentsInCluster.keys())>=experts_twitter_stream_settings['cluster_filter_threshold']]
     iteration_data = evaluation.getEvaluationMetrics(documentClusters, currentTime-previousTime, {'type': experts_twitter_stream_settings['lsh_type'], 'total_clusters': len(hdStreamClusteringObject.clusters), 'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)})
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyNotUsingVanillaLSH.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
Example No. 27
 def processing_file(self,filename,outputfile):
     for line in open(filename):
         try:
             newline=self.pre_processing_line(line)
             #print newline
             if newline==0:
                 continue
             FileIO.writeToFileAsJson(newline, outputfile)
         except Exception as e:
             print e
    def generateStatsForOptimized():
#        for i in [10**3, 10**4, 10**5]: 
        for length in [1000000, 1100000, 1200000]: 
#        for i in [10**6]:
#            for j in range(1, 10): 
                print 'Generating stats for: ', length
                tf = TweetsFile(length, **experts_twitter_stream_settings)
                FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForHDLSHClustering(), 
                                          'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                          hd_clustering_performance_folder+'cda')
    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        ''' Estimate the threshold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
        Run this on a document set of size 100K.
        '''
        for length in [i * j for i in 10 ** 3, 10 ** 4, 10 ** 5 for j in range(1, 10)]: 
#            for t in range(1, 16): 
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                print length, stream_settings['threshold_for_document_to_be_in_cluster']
                stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(stream_settings)}
                FileIO.writeToFileAsJson(stats, stats_file)
Example No. 30
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    if not inputFilesStartTime: inputFilesStartTime=startTime; inputFilesEndTime=endTime
#    outputFile = hashtagsWithEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsWithoutEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsWithoutEndingWindowWithoutLatticeApproximationFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsAllOccurrencesWithinWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = timeUnitWithOccurrencesFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    outputFile = latticeGraphFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = 'mr_Data/timeUnitWithOccurrences'
    runMRJob(MRAnalysis, outputFile, getInputFiles(inputFilesStartTime, inputFilesEndTime), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
Example No. 31
 def generateStatsForOptimized():
     #        for i in [10**3, 10**4, 10**5]:
     for length in [1000000, 1100000, 1200000]:
         #        for i in [10**6]:
         #            for j in range(1, 10):
         print 'Generating stats for: ', length
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         FileIO.writeToFileAsJson(
             {
                 'streaming_lsh': tf.generateStatsForHDLSHClustering(),
                 'settings': Settings.getSerialzedObject(tf.stream_settings)
             }, hd_clustering_performance_folder + 'cda')
Example No. 32
 def generateStatsForDefaultStreamSettings():
     for i in [10**3, 10**4, 10**5]:
         for j in range(1, 10):
             print 'Generating stats for: ', i * j
             tf = TweetsFile(i * j,
                             **default_experts_twitter_stream_settings)
             FileIO.writeToFileAsJson(
                 {
                     'streaming_lsh':
                     tf.generateStatsForStreamingLSHClustering(),
                     'settings':
                     Settings.getSerialzedObject(tf.stream_settings)
                 }, TweetsFile.default_stats_file)
 def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
     phrasesLagDistribution = defaultdict(int)
     for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues():
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, phraseObject.latestOccuranceTime, estimationObject.stream_settings['time_unit_in_seconds'].seconds)
         phrasesLagDistribution[str(lag)] += 1
     print currentMessageTime
     iterationData = {
                      'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'settings': pprint.pformat(estimationObject.stream_settings),
                      ParameterEstimation.dimensionInActivityTimeId:estimationObject.lagBetweenMessagesDistribution,
                      'phrases_lag_distribution': phrasesLagDistribution
                      }
     FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionInActivityTimeFile)
    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, cluster.lastStreamAddedTime, hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
                         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                         'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
                         ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
                         'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
                         }
#        print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])
Example No. 35
    def dimensionsEstimation(estimationObject, currentMessageTime):
        '''
        This function is used to estimate the number of dimensions in the stream. To estimate it, we calculate
        the number of phrases that need to be added every iteration for different dimensions.
        The dimension at which the number of phrases added stabilizes is the number of dimensions
        for the stream.

        Why do we need this?
        The aim is to get dimensions that don't change too often and, at the same time, are not very large.
        This experiment gives us an approximate idea of the number of dimensions. Randomly picking
        a small value will result in dimensions that are not good, and picking too big a value will
        result in inefficiency.
        '''
        def updatePhraseScore(phraseObject):
            phraseObject.updateScore(currentMessageTime, 0,
                                     **estimationObject.stream_settings)
            return phraseObject

        topDimensionsDuringCurrentIteration = [
            p.text
            for p in Phrase.sort((updatePhraseScore(p)
                                  for p in estimationObject.
                                  phraseTextToPhraseObjectMap.itervalues()),
                                 reverse=True)
        ]
        oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
        if estimationObject.topDimensionsDuringPreviousIteration:
            dimensions_estimation = {}
            for boundary in estimationObject.boundaries:
                if boundary < len(
                        estimationObject.phraseTextToPhraseObjectMap):
                    dimensions_estimation[str(boundary)] = len(
                        set(newList[:boundary]).difference(oldList[:boundary]))
            print currentMessageTime, len(
                estimationObject.phraseTextToPhraseObjectMap)
            iterationData = {
                'time_stamp':
                getStringRepresentationForTweetTimestamp(currentMessageTime),
                'total_number_of_phrases':
                len(estimationObject.phraseTextToPhraseObjectMap),
                'settings':
                estimationObject.stream_settings.convertToSerializableObject(),
                ParameterEstimation.dimensionsEstimationId:
                dimensions_estimation
            }
            FileIO.writeToFileAsJson(iterationData,
                                     estimationObject.dimensionsEstimationFile)
        estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
Example No. 36
 def generateStatsForUnOptimized():
     #        for i in [10**3, 10**4, 10**5]:
     for length in [1000000, 1100000, 1200000]:
         #        for i in [10**6]:
         #            for j in range(1, 10):
         print 'Generating stats for: ', length
         #                default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
         tf = TweetsFile(length, **default_experts_twitter_stream_settings)
         performance = tf.generateStatsForHDLSHClustering()
         FileIO.writeToFileAsJson(
             {
                 'streaming_lsh': performance,
                 'settings': Settings.getSerialzedObject(tf.stream_settings)
             }, hd_clustering_performance_folder + 'cda_unopt')
         del performance['clusters']
         print performance
Example No. 37
def getStatsForSSA():
    batchSize = 10000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for id in range(21, 50):
        fileName = time_to_process_points + '%s/%s' % (batchSize, id)
        ts = time.time()
        sstObject = SimilarStreamAggregation(
            dict(iterateUserDocuments(fileName)),
            default_experts_twitter_stream_settings['ssa_threshold'])
        sstObject.estimate()
        #    documentClusters = list(sstObject.iterateClusters())
        iteration_data = {
            'iteration_time': time.time() - ts,
            'type': 'ssa',
            'number_of_messages': batchSize * (id + 1),
            'batch_size': batchSize
        }
        FileIO.writeToFileAsJson(iteration_data, ssa_stats_file)
Example No. 38
 def thresholdForDocumentToBeInCluterEstimation(stats_file,
                                                **stream_settings):
     ''' Estimate the threshold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
     Run this on a document set of size 100K.
     '''
     for length in [
             i * j for i in 10**3, 10**4, 10**5 for j in range(1, 10)
     ]:
         #            for t in range(1, 16):
         for t in range(16, 21):
             stream_settings[
                 'threshold_for_document_to_be_in_cluster'] = t * 0.05
             print length, stream_settings[
                 'threshold_for_document_to_be_in_cluster']
             stats = {
                 'streaming_lsh':
                 KMeansTweetsFile(length, **stream_settings).
                 generateStatsForStreamingLSHClustering(),
                 'settings':
                 Settings.getSerialzedObject(stream_settings)
             }
             FileIO.writeToFileAsJson(stats, stats_file)
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject,
                                   currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >=
         experts_twitter_stream_settings['cluster_filter_threshold']
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters, currentTime - previousTime, {
             'type':
             experts_twitter_stream_settings['trie_type'],
             'total_clusters':
             len(hdStreamClusteringObject.clusters),
             'current_time':
             getStringRepresentationForTweetTimestamp(currentMessageTime)
         })
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(
         currentMessageTime), iteration_data
Example No. 40
 def writeGraphToFile(graph, fileName): FileIO.writeToFileAsJson(Networkx.getDictForGraph(graph), fileName)
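Networkx.getDictForGraph is a project helper that is not shown here. Assuming it simply produces a JSON-serializable dict for the graph, the standard networkx json_graph module offers a comparable conversion:

import networkx as nx
from networkx.readwrite import json_graph

graph = nx.Graph()
graph.add_edge('a', 'b', weight=1.0)
# Plain dict of nodes and links, safe to hand to a JSON writer.
graph_as_dict = json_graph.node_link_data(graph)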
Example No. 41
    def dimensionsUpdateFrequencyEstimation(estimationObject,
                                            currentMessageTime):
        '''
        Observe the new dimensions that get added to the current dimensions if the dimensions
        are being updated at regular intervals.
        For example, the number of dimensions being added after 10m, 20m, ... 5 hours.
        As time increases, the number of 'decayed' dimensions increases. The current dimensions
        have a lot of unwanted decayed dimensions. Using this information, identify the time
        interval that is best suited to refresh dimensions.
        Tentative: we decide to pick the time interval at which the rate of decay is maximum.
        '''
        def updatePhraseScore(phraseObject):
            phraseObject.updateScore(currentMessageTime, 0,
                                     **estimationObject.stream_settings)
            return phraseObject

        dimensions = estimationObject.stream_settings['dimensions']
        newList = [
            p.text
            for p in Phrase.sort((updatePhraseScore(p)
                                  for p in estimationObject.
                                  phraseTextToPhraseObjectMap.itervalues()),
                                 reverse=True)
        ][:dimensions]
        print currentMessageTime, len(newList)
        if len(newList) >= dimensions:
            idsOfDimensionsListToCompare = [
                (i,
                 GeneralMethods.approximateToNearest5Minutes(
                     currentMessageTime - i))
                for i in estimationObject.dimensionUpdateTimeDeltas
                if GeneralMethods.approximateToNearest5Minutes(
                    currentMessageTime -
                    i) in estimationObject.dimensionListsMap
            ]
            dimensionsUpdateFrequency = {}
            for td, id in idsOfDimensionsListToCompare:
                oldList = estimationObject.dimensionListsMap[id]
                dimensionsUpdateFrequency[str(td.seconds)] = len(
                    set(newList).difference(oldList))
            print len(
                estimationObject.dimensionListsMap), currentMessageTime, len(
                    newList), [(k, dimensionsUpdateFrequency[k])
                               for k in sorted(dimensionsUpdateFrequency)]
            iterationData = {
                'time_stamp':
                getStringRepresentationForTweetTimestamp(currentMessageTime),
                'total_number_of_phrases':
                len(estimationObject.phraseTextToPhraseObjectMap),
                'settings':
                pprint.pformat(estimationObject.stream_settings),
                ParameterEstimation.dimensionsUpdateFrequencyId:
                dimensionsUpdateFrequency
            }
            FileIO.writeToFileAsJson(
                iterationData, estimationObject.dimensionsUpdateFrequencyFile)
            estimationObject.dimensionListsMap[
                GeneralMethods.approximateToNearest5Minutes(
                    currentMessageTime)] = newList[:]
            for key in estimationObject.dimensionListsMap.keys()[:]:
                if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[
                        -1]:
                    del estimationObject.dimensionListsMap[key]
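GeneralMethods.approximateToNearest5Minutes is another project helper; the code above only needs it to bucket timestamps consistently so that dimension lists from earlier iterations can be looked up again. A hedged sketch of one plausible implementation (the floor-to-bucket behavior is an assumption):

import datetime

def approximate_to_nearest_5_minutes(dt):
    # Assumed behavior: floor a datetime to the start of its 5-minute bucket.
    return dt.replace(minute=dt.minute - dt.minute % 5, second=0, microsecond=0)

# approximate_to_nearest_5_minutes(datetime.datetime(2012, 1, 1, 10, 43, 20))
# -> datetime.datetime(2012, 1, 1, 10, 40)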