Exemplos de FileIO.writeToFileAsJson em Python, exemplos de library.file_io.FileIO.writeToFileAsJson em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

    def generate_data_for_significant_nei_utm_ids():
        output_file = GeneralMethods.get_method_id()+'.json'
        so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {}
        for utm_object in \
                FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
            for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems():
                if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag)
            mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\
                                                            utm_object['mf_nei_utm_id_to_common_h_count'].keys()
        hashtags = sorted(list(so_hashtags))
        mf_utm_id_to_vector = {}
        for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True):
#                print i, utm_object['utm_id']
            utm_id_vector =  map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0),
                                 hashtags)
            mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector)
        for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()):
            print '%s of %s'%(i+1, len(mf_utm_id_to_vector))
            ltuo_utm_id_and_vector = [(utm_id, vector)]
            for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]:
                if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id:
                    ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id]))
            od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0)))
            df_utm_vectors = robjects.DataFrame(od)
            df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors)
            dfm_dict = cjson.decode(df_utm_vectors_json)
            mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames))
            utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id]
            dfm_dict['prediction_variable'] = utm_id_colname
            dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname,
                                                     df_utm_vectors.colnames)
            dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0]))
            FileIO.writeToFileAsJson(dfm_dict, output_file)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: models.py Projeto: kykamath/hashtags_and_geo

 def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids):
     for test_model_id in test_models_ids:
         output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id)
         GeneralMethods.runCommand('rm -rf %s'%output_file)
         ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time()
         for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\
                 enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time):
             ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1]))
                                                     for location, ito_location_and_occurrence_time in
                                                         groupby(
                                                                 sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)),
                                                                 key=itemgetter(0)
                                                         )
                                                 ] 
             print hashtag_count, test_model_id
             ltuo_location_and_pure_influence_score = []
             for location, location_occurrence_times in ltuo_location_and_occurrence_times:
                 pure_influence_scores = []
                 for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times:
                     if location!=neighbor_location:
                         pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times)
                         pure_influence_scores.append(pure_influence_score)
                 ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)])
             ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1))
             FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: models_1.py Projeto: kykamath/spam_model

    def trendCurves(iterationData=None, experimentFileName=None):
        '''
        Record topic counts during a run, or plot them afterwards.

        Recording mode (iterationData is given): iterationData is unpacked as
        (currentTimeStep, _, currentTopics, _, finalCall, conf) and the output
        file is taken from conf['experimentFileName']. On regular calls the
        per-topic total and current-time-step counts are written; on the final
        call the trending topic ids, the topic colors and the configuration
        are written instead.

        Plotting mode (iterationData is falsy): the records are read back from
        experimentFileName and drawn with plt.fill_between, first for all
        topics and then, on a new figure, for the trending topics only.
        '''
        if iterationData:
            currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
            experimentFileName = conf['experimentFileName']
            if not finalCall:
                topicDistribution = dict(
                    (str(topic.id), {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]})
                    for topic in currentTopics
                )
                FileIO.writeToFileAsJson({'t':currentTimeStep, 'topics':topicDistribution}, experimentFileName)
            else:
                # Write a filtered copy instead of deleting the key from conf:
                # conf belongs to the caller and may still be needed there, and
                # a copy also tolerates the key being absent.
                confToWrite = dict((key, value) for key, value in conf.iteritems() if key!='spamDectectionMethod')
                iterationInfo = {
                    'trending_topics': [topic.id for topic in currentTopics if topic.stickiness>=stickinessLowerThreshold],
                    'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                    'conf': confToWrite,
                }
                FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
        else:
            topicsDataX = defaultdict(list)
            topicsDataY = defaultdict(list)
            for data in FileIO.iterateJsonFromFile(experimentFileName):
                if 'conf' not in data:
                    # Regular record: one (time step, count) point per topic.
                    for topic in data['topics']:
                        topicsDataX[topic].append(data['t'])
                        topicsDataY[topic].append(data['topics'][topic]['timeStep'])
                else:
                    # Final record: carries the colors and the trending topics.
                    topicColorMap = data['topic_colors']
                    trendingTopics = data['trending_topics']
            for topic in topicsDataX:
                plt.fill_between(topicsDataX[topic], topicsDataY[topic], color=topicColorMap[str(topic)], alpha=1.0)
            plt.figure()
            for topic in trendingTopics:
                plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)], color=topicColorMap[str(topic)], alpha=1.0)
            plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
            plt.show()

Exemplo n.º 4

0

Exibir arquivo

Arquivo: models.py Projeto: kykamath/hashtags_and_geo

    def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag):
        for model_id in models_ids:
#            if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
#            else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag)
            output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag)
            GeneralMethods.runCommand('rm -rf %s'%output_file)
            for line_count, location_object in enumerate(iterateJsonFromFile(
                     location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                     )):
                print line_count, model_id
                tuo_neighbor_location_and_pure_influence_score = []
                location_hashtag_set = set(location_object['hashtags'])
                for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems():
                    pure_influence_scores = []
                    for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems():
                        if hashtag in location_object['hashtags']:
                            location_occurrences = location_object['hashtags'][hashtag][0]
                            pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences))
                    neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys())
                    if hashtag_tag==w_extra_hashtags_tag:
                        for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0)
                        for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0)
                    mean_pure_influence_score = np.mean(pure_influence_scores)
                    tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score])
                tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, key=itemgetter(1))
                FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: spots_radius.py Projeto: kykamath/users_and_geo

def generateRadiusSpots(radiusInMiles):
    graph = nx.Graph()
    spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles)
    print 'Creating:', spotsFile
    for lid in locationIterator():
        for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid)
    for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: models.py Projeto: kykamath/hashtags_and_geo

 def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder):
     def location_similarity(location_vector_1, location_vector_2): 
         return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.)
     influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]
     for model_id in model_ids:
         mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id))
         GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)
         for line_count, location_object in enumerate(iterateJsonFromFile(
                      location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
                  )):
             print line_count
             location = location_object['id']
             tuo_neighbor_location_and_mf_influence_type_and_similarity = []
             for neighbor_location in location_object['links'].keys(): 
                 mf_influence_type_and_similarity = {}
                 for influence_type in influence_types:
                     similarity = location_similarity( 
                                                          mf_location_to_mf_influence_type_to_influence_vector[location][influence_type],
                                                          mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type]
                                                   )
                     mf_influence_type_and_similarity[influence_type] = similarity
                 so_hashtags_for_location = set(location_object['hashtags'].keys())
                 so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys())
                 numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0.
                 denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0.
                 mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator                
                 tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity])
             FileIO.writeToFileAsJson(
                                      [location, tuo_neighbor_location_and_mf_influence_type_and_similarity],
                                      tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id
                                      )

Exemplo n.º 7

0

Exibir arquivo

Arquivo: quality_comparison_with_kmeans.py Projeto: greeness/hd_streams_clustering

 def generateStatsForMRKMeansClusteringQuality():
     for i in [90000, 100000, 200000, 300000, 400000, 500000]: 
         print 'Generating stats for: ',i
         tf = TweetsFile(i, **experts_twitter_stream_settings)
         FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(), 
                                   'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                   TweetsFile.mr_stats_file)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: models.py Projeto: kykamath/spam_model

    def measureRankingQuality(iterationData=None, experimentFileName=None):
        '''
        Record the spammness of every ranking method for the current time step.

        Nothing happens when iterationData is falsy or when it marks the final
        call. Otherwise the ten topics with the highest values in
        model.topicsDistributionInTheTimeSet are used as query topics, each
        ranking method in conf["rankingMethods"] is run for every query topic,
        and spammness(messages, norm_k) is collected per ranking id. The record
        goes to conf["experimentFileName"], after which the model's topic
        distribution for the time set is reset.
        '''
        if not iterationData:
            return
        currentTimeStep, model, _, _, finalCall, conf = iterationData
        if finalCall:
            return
        rankingMethods = conf["rankingMethods"]
        experimentFileName = conf["experimentFileName"]
        topicsByCount = sorted(model.topicsDistributionInTheTimeSet.iteritems(), key=itemgetter(1), reverse=True)
        topTopics = topicsByCount[:10]
        spammnessByRankingId = defaultdict(list)
        for rankingMethod in rankingMethods:
            for queryTopic, _ in topTopics:
                ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
                spammnessByRankingId[ranking_id].append(spammness(messages, norm_k))
        record = {"currentTimeStep": currentTimeStep, "spammmess": spammnessByRankingId}
        FileIO.writeToFileAsJson(record, experimentFileName)
        # Start the next time set with an empty distribution.
        model.topicsDistributionInTheTimeSet = defaultdict(int)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: quality_comparison_with_kmeans.py Projeto: greeness/hd_streams_clustering

 def generate(self):
     '''
     Write tweets from the experts iterator to self.fileName, stopping once
     self.length tweets have been written, then gzip the file.
     '''
     for written, tweet in enumerate(TwitterIterators.iterateTweetsFromExperts(), 1):
         FileIO.writeToFileAsJson(tweet, self.fileName)
         if written==self.length:
             break
     os.system('gzip %s'%self.fileName)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: algorithms_performance.py Projeto: kykamath/hd_streams_clustering

 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             "type": experts_twitter_stream_settings["dimensions_performance_type"],
             "dimensions": experts_twitter_stream_settings["dimensions"],
         },
     )
     iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap)
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file)
     del iteration_data["clusters"]
     print currentMessageTime, iteration_data
     if experts_twitter_stream_settings["dimensions"] != 76819 and 2 * experts_twitter_stream_settings[
         "dimensions"
     ] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap):
         raise Exception

Exemplo n.º 11

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
#    output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

    print PARAMS_DICT
#    runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)

Exemplo n.º 12

0

Exibir arquivo

Arquivo: stream_parameters_estimation.py Projeto: greeness/hd_streams_clustering

 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     This class is used to dimensionsEstimation dimensions in the stream. To dimensionsEstimation it we calculate
     the number of phrases that need to added every iteration for different dimensions.
     The dimension at which the number of phrases added stablizes is the number of dimensions
     for the stream.
     
     Why do we need this?
     The aim is to get dimensions, that dont change too often at the same time are not very huge.
     This experiments gives us an approximate idea of the number of dimensions. Randomly picking 
     a small value will result in dimensions that are not good and picking too big a value will 
     result in inefficiency.  
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]

Exemplo n.º 13

0

Exibir arquivo

Arquivo: stream_parameters_estimation.py Projeto: greeness/hd_streams_clustering

 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to current dimension if the dimensions 
     are being updated at regular intervals.
     For example, number of dimensions being added after 10m, 20m,... 5 horus. 
     As time increases the number of 'decayed' dimensions increase. The current dimensions
     has a lot of unwanted decayed dimensions. Using this information identify the time 
     interval that is best suited to refresh dimensions. 
     Tentative: We decide to pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]

Exemplo n.º 14

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

    def analyzeQuality(graphs, graphType):
        '''
        Write a cluster-quality score for each edgesToKeep value 0.1 ... 1.0.

        graphs is turned into a dict (graph id -> graph); the smallest and
        largest ids bound the time range. For every edgesToKeep value one
        {'edgesToKeep': ..., 'score': ...} record is written to
        qualityMetricsFolder%graphType, which is removed first.
        '''
        def getQualityScore(graphMap, edgesToKeep, timeDifference):
            # NOTE: startingGraphId and endingGraphId are read from the enclosing
            # scope; they are assigned in the loop below before each call.
            dataToReturn = []
            # Only one interval multiplier (1) is used, so the interval equals
            # the whole time range.
            for j, intervalInSeconds in enumerate([1]):
                intervalInSeconds*=timeDifference
                linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep)
                logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep)
                # Group the clustering output on its second field and keep the
                # first field of each item (presumably (location, cluster id)
                # pairs -- confirm against clusterUsingAffinityPropagation).
                linearClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                logarithmicClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))]
                score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters)
                print intervalInSeconds, edgesToKeep, score
                dataToReturn.append(score)
            return dataToReturn
        graphFile = qualityMetricsFolder%graphType
        print graphFile
        GeneralMethods.runCommand('rm -rf %s'%graphFile)
        for edgesToKeep in range(1,11): 
#        for edgesToKeep in [1,10]: 
            # Turn the loop counter 1..10 into the fraction 0.1..1.0.
            edgesToKeep*=0.1
            # Fresh dict for every edgesToKeep value, built from a copy of graphs.
            graphMap = dict(graphs[:])
            startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys())
            timeDifference = endingGraphId-startingGraphId
            LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep)
#            print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}
            FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

 def run_job_on_hashtags_in_dfs(mr_class, output_file):
     job_conf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     print 'Hadoop job conf:'
     pprint(job_conf)
     runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: data_generation_and_crowd_analysis.py Projeto: greeness/hd_streams_clustering

 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: data_generation_and_crowd_analysis.py Projeto: greeness/hd_streams_clustering

 def writeTweetsForDay(currentDay):
     '''
     Write the tweets created within currentDay to the per-day file inside
     houston_data_folder, then gzip that file.

     Tweets whose user id has no screen name are skipped. Each record holds
     the tweet id, text, creation time stamp and the user's screen name.
     '''
     fileName = houston_data_folder+FileIO.getFileByDay(currentDay)
     # NOTE(review): both bounds are exclusive, so tweets stamped exactly at
     # currentDay or at currentDay+86399s are not selected -- confirm that
     # this is intended.
     query = {'ca': {'$gt':currentDay, '$lt': currentDay+timedelta(seconds=86399)}}
     for tweet in tweets.find(query, fields=['ca', 'tx', 'uid']):
         screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
         if screenName is None:
             continue
         # Reuse the screen name looked up above instead of resolving it again.
         data = {
             'id': tweet['_id'],
             'text': tweet['tx'],
             'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
             'user': {'screen_name': screenName},
         }
         FileIO.writeToFileAsJson(data, fileName)
     os.system('gzip %s'%fileName)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: quality_comparison_with_kmeans.py Projeto: greeness/hd_streams_clustering

 def generateStatsForDefaultStreamSettings():
     for i in [10**3, 10**4, 10**5]: 
         for j in range(1, 10):
             print 'Generating stats for: ',i*j
             tf = TweetsFile(i*j, **default_experts_twitter_stream_settings)
             FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 
                                       'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                       TweetsFile.default_stats_file)

Exemplo n.º 19

0

Exibir arquivo

Arquivo: quality_comparison_with_ssa.py Projeto: kykamath/hd_streams_clustering

 def generateStatsForQualityComparisonWithSSA():
     #        for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]:
     for length in [1000000]:
         print "Generating stats for: ", length
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         #            stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}
         stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)}
         FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)

Exemplo n.º 20

0

Exibir arquivo

Arquivo: sequences.py Projeto: kykamath/users_and_geo

def writeCheckinSequenceGraphFile():   
    userSet = set([userVector['user'] for userVector in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord = True)])
    count, total = 1, len(userSet)
    for user in userSet:
        print user, count, total
        checkins = [(c['_id'], c['lid'], time.mktime(c['t'].timetuple())) for c in checkinsCollection.find({'u': user})]
        for i in GeneralMethods.getElementsInWindow(checkins, 2): FileIO.writeToFileAsJson([user, i], checkinSequenceGraphFile)
        count+=1

Exemplo n.º 21

0

Exibir arquivo

Arquivo: user_clustering.py Projeto: kykamath/users_and_geo

def generateLocationClusterData():
    """Run clusterLocation on every item from locationClusterIterator().

    Each result is passed to FileIO.writeToFileAsJson with
    locationClustersFile, one call per location.
    """
    for rawLocation in locationClusterIterator():
        clusteredLocation = clusterLocation(rawLocation)
        FileIO.writeToFileAsJson(clusteredLocation, locationClustersFile)

Exemplo n.º 22

0

Exibir arquivo

Arquivo: places1.py Projeto: kykamath/users_and_geo

 def writeUserClustersFile(place):
     print 'Generating clusters...'
     userVectors = GenerateDataFiles.getUserVectors(place)
     GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name'])
     clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1')
 #    clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2')
     for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]}
     for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])

Exemplo n.º 23

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

    def run():
        """Generate ten random-graph records per enabled generator.

        For i in 1..10 a record {'n': 100*i, 'graphs': method(1000*i)} is
        passed to FileIO.writeToFileAsJson, with the target derived from
        randomGraphsFolder % graphType.
        """
        # Only the power-law cluster generator is enabled; the fast G(n, p),
        # Erdos-Renyi and Newman-Watts-Strogatz pairs are disabled upstream.
        enabledGenerators = [
            (RandomGraphGenerator.powerlaw_cluster_graph,
             RandomGraphGenerator.powerlawClusterGraph),
        ]
        for graphType, method in enabledGenerators:
            for i in range(1, 11):
                # NOTE(review): 'n' is recorded as 100*i while the generator
                # is called with 1000*i -- confirm which one is intended.
                record = {'n': 100 * i, 'graphs': method(1000 * i)}
                FileIO.writeToFileAsJson(record, randomGraphsFolder % graphType)

Exemplo n.º 24

0

Exibir arquivo

Arquivo: analysis_nov_12.py Projeto: kykamath/hashtags_and_geo

 def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
     """Run a map-reduce job over a tweet time window and record its params.

     mr_class                -- job class handed to runMRJob.
     output_file             -- passed to runMRJob and, afterwards, to
                                FileIO.writeToFileAsJson with PARAMS_DICT.
     input_files_start_time,
     input_files_end_time    -- objects with timetuple(); they select the
                                input files and are stored in PARAMS_DICT as
                                epoch seconds (time.mktime).
     """
     PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
     PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
     # pprint() writes to stdout itself and returns None, so it has to be a
     # separate statement: embedded in the print it emitted the dict first and
     # then the label followed by "None".
     print('Running map reduce with the following params:')
     pprint(PARAMS_DICT)
     runMRJob(mr_class,
              output_file,
              MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time),
              jobconf={'mapred.reduce.tasks':500})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)

Exemplo n.º 25

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

 def tweet_stats(input_files_start_time, input_files_end_time):
     """Run the TweetStats map-reduce job for the given input-file window.

     The job output goes to f_tweet_stats; after the job finishes,
     PARAMS_DICT is passed to FileIO.writeToFileAsJson with that same file.
     """
     statsFile = f_tweet_stats
     jobOptions = {
         'mrJobClassParams': {'job_id': 'as'},
         'jobconf': {'mapred.reduce.tasks': 300},
     }
     runMRJob(
         TweetStats,
         statsFile,
         getInputFiles(input_files_start_time, input_files_end_time),
         **jobOptions)
     FileIO.writeToFileAsJson(PARAMS_DICT, statsFile)

Exemplo n.º 26

0

Exibir arquivo

Arquivo: algorithms_performance.py Projeto: greeness/hd_streams_clustering

 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [cluster.documentsInCluster.keys() for k, cluster in hdStreamClusteringObject.clusters.iteritems() if len(cluster.documentsInCluster.keys())>=experts_twitter_stream_settings['cluster_filter_threshold']]
     iteration_data = evaluation.getEvaluationMetrics(documentClusters, currentTime-previousTime, {'type': experts_twitter_stream_settings['lsh_type'], 'total_clusters': len(hdStreamClusteringObject.clusters), 'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)})
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyNotUsingVanillaLSH.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data

Exemplo n.º 27

0

Exibir arquivo

Arquivo: preprocessing_with_steming.py Projeto: WeiNiu/lsfolk

 def processing_file(self,filename,outputfile):
     """Pre-process a file line by line and emit each surviving line as JSON.

     filename   -- path of the text file to read.
     outputfile -- passed to FileIO.writeToFileAsJson for every line that
                   self.pre_processing_line does not reject.

     Lines for which pre_processing_line returns 0 are skipped. Any exception
     raised while handling a line is printed and processing continues with
     the next line.
     """
     # The context manager closes the input even if iteration stops early;
     # the bare open() in the loop header left the handle to the GC.
     with open(filename) as source:
         for line in source:
             try:
                 newline = self.pre_processing_line(line)
                 if newline == 0:
                     continue
                 FileIO.writeToFileAsJson(newline, outputfile)
             except Exception as e:
                 # Deliberate best-effort: report the bad line and carry on.
                 print(e)

Exemplo n.º 28

0

Exibir arquivo

Arquivo: quality_comparison_with_kmeans.py Projeto: greeness/hd_streams_clustering

    def generateStatsForOptimized():
#        for i in [10**3, 10**4, 10**5]: 
        for length in [1000000, 1100000, 1200000]: 
#        for i in [10**6]:
#            for j in range(1, 10): 
                print 'Generating stats for: ', length
                tf = TweetsFile(length, **experts_twitter_stream_settings)
                FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForHDLSHClustering(), 
                                          'settings': Settings.getSerialzedObject(tf.stream_settings)}, 
                                          hd_clustering_performance_folder+'cda')

Exemplo n.º 29

0

Exibir arquivo

Arquivo: stream_parameters_estimation.py Projeto: greeness/hd_streams_clustering

    def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
        ''' Estimate thresold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
        Run this on a document set of size 100K. 
        '''
        for length in [i * j for i in 10 ** 3, 10 ** 4, 10 ** 5 for j in range(1, 10)]: 
#            for t in range(1, 16): 
            for t in range(16, 21):
                stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05
                print length, stream_settings['threshold_for_document_to_be_in_cluster']
                stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(stream_settings)}
                FileIO.writeToFileAsJson(stats, stats_file)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: analysis.py Projeto: kykamath/hashtags_and_geo

def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    """Run MRAnalysis for a time window and record PARAMS_DICT afterwards.

    startTime, endTime   -- objects with strftime(); they name the output
                            file (formatted as YYYY-MM-DD).
    outputFolder         -- first value substituted into latticeGraphFile.
    inputFilesStartTime,
    inputFilesEndTime    -- optional bounds passed to getInputFiles; each one
                            falls back to startTime / endTime when omitted.
    """
    # Default each bound on its own. The previous joint check overwrote a
    # caller-supplied end bound whenever the start bound was omitted, and left
    # the end bound as None when only the start bound was supplied.
    if not inputFilesStartTime: inputFilesStartTime = startTime
    if not inputFilesEndTime: inputFilesEndTime = endTime
    # Alternative output templates left disabled upstream:
    # hashtagsWithEndingWindowFile, hashtagsWithoutEndingWindowFile,
    # hashtagsWithoutEndingWindowWithoutLatticeApproximationFile,
    # hashtagsAllOccurrencesWithinWindowFile, timeUnitWithOccurrencesFile,
    # and the literal 'mr_Data/timeUnitWithOccurrences'.
    outputFile = latticeGraphFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    runMRJob(MRAnalysis, outputFile, getInputFiles(inputFilesStartTime, inputFilesEndTime), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)

Exemplo n.º 31

0

Exibir arquivo

 def generateStatsForOptimized():
     #        for i in [10**3, 10**4, 10**5]:
     for length in [1000000, 1100000, 1200000]:
         #        for i in [10**6]:
         #            for j in range(1, 10):
         print 'Generating stats for: ', length
         tf = TweetsFile(length, **experts_twitter_stream_settings)
         FileIO.writeToFileAsJson(
             {
                 'streaming_lsh': tf.generateStatsForHDLSHClustering(),
                 'settings': Settings.getSerialzedObject(tf.stream_settings)
             }, hd_clustering_performance_folder + 'cda')

Exemplo n.º 32

0

Exibir arquivo

 def generateStatsForDefaultStreamSettings():
     for i in [10**3, 10**4, 10**5]:
         for j in range(1, 10):
             print 'Generating stats for: ', i * j
             tf = TweetsFile(i * j,
                             **default_experts_twitter_stream_settings)
             FileIO.writeToFileAsJson(
                 {
                     'streaming_lsh':
                     tf.generateStatsForStreamingLSHClustering(),
                     'settings':
                     Settings.getSerialzedObject(tf.stream_settings)
                 }, TweetsFile.default_stats_file)

Exemplo n.º 33

0

Exibir arquivo

Arquivo: stream_parameters_estimation.py Projeto: ylaron/hd_streams_clustering

 def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
     phrasesLagDistribution = defaultdict(int)
     for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues():
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, phraseObject.latestOccuranceTime, estimationObject.stream_settings['time_unit_in_seconds'].seconds)
         phrasesLagDistribution[str(lag)] += 1
     print currentMessageTime
     iterationData = {
                      'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'settings': pprint.pformat(estimationObject.stream_settings),
                      ParameterEstimation.dimensionInActivityTimeId:estimationObject.lagBetweenMessagesDistribution,
                      'phrases_lag_distribution': phrasesLagDistribution
                      }
     FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionInActivityTimeFile)

Exemplo n.º 34

0

Exibir arquivo

Arquivo: stream_parameters_estimation.py Projeto: ylaron/hd_streams_clustering

    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, cluster.lastStreamAddedTime, hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
                         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                         'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
                         ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
                         'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
                         }
#        print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])

Exemplo n.º 35

0

Exibir arquivo

    def dimensionsEstimation(estimationObject, currentMessageTime):
        '''
        This class is used to dimensionsEstimation dimensions in the stream. To dimensionsEstimation it we calculate
        the number of phrases that need to added every iteration for different dimensions.
        The dimension at which the number of phrases added stablizes is the number of dimensions
        for the stream.
        
        Why do we need this?
        The aim is to get dimensions, that dont change too often at the same time are not very huge.
        This experiments gives us an approximate idea of the number of dimensions. Randomly picking 
        a small value will result in dimensions that are not good and picking too big a value will 
        result in inefficiency.  
        '''
        def updatePhraseScore(phraseObject):
            phraseObject.updateScore(currentMessageTime, 0,
                                     **estimationObject.stream_settings)
            return phraseObject

        topDimensionsDuringCurrentIteration = [
            p.text
            for p in Phrase.sort((updatePhraseScore(p)
                                  for p in estimationObject.
                                  phraseTextToPhraseObjectMap.itervalues()),
                                 reverse=True)
        ]
        oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
        if estimationObject.topDimensionsDuringPreviousIteration:
            dimensions_estimation = {}
            for boundary in estimationObject.boundaries:
                if boundary < len(
                        estimationObject.phraseTextToPhraseObjectMap):
                    dimensions_estimation[str(boundary)] = len(
                        set(newList[:boundary]).difference(oldList[:boundary]))
            print currentMessageTime, len(
                estimationObject.phraseTextToPhraseObjectMap)
            iterationData = {
                'time_stamp':
                getStringRepresentationForTweetTimestamp(currentMessageTime),
                'total_number_of_phrases':
                len(estimationObject.phraseTextToPhraseObjectMap),
                'settings':
                estimationObject.stream_settings.convertToSerializableObject(),
                ParameterEstimation.dimensionsEstimationId:
                dimensions_estimation
            }
            FileIO.writeToFileAsJson(iterationData,
                                     estimationObject.dimensionsEstimationFile)
        estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]

Exemplo n.º 36

0

Exibir arquivo

 def generateStatsForUnOptimized():
     #        for i in [10**3, 10**4, 10**5]:
     for length in [1000000, 1100000, 1200000]:
         #        for i in [10**6]:
         #            for j in range(1, 10):
         print 'Generating stats for: ', length
         #                default_experts_twitter_stream_settings['cluster_filtering_method'] = emptyClusterFilteringMethod
         tf = TweetsFile(length, **default_experts_twitter_stream_settings)
         performance = tf.generateStatsForHDLSHClustering()
         FileIO.writeToFileAsJson(
             {
                 'streaming_lsh': performance,
                 'settings': Settings.getSerialzedObject(tf.stream_settings)
             }, hd_clustering_performance_folder + 'cda_unopt')
         del performance['clusters']
         print performance

Exemplo n.º 37

0

Exibir arquivo

def getStatsForSSA():
    """Time SimilarStreamAggregation on input files 21..49.

    Sets the 'ssa_threshold' default setting to 0.75, then for every file id
    builds the aggregation from iterateUserDocuments(fileName), runs
    estimate(), and passes the elapsed wall-clock time plus bookkeeping
    fields to FileIO.writeToFileAsJson with ssa_stats_file.
    """
    batchSize = 10000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for fileId in range(21, 50):
        fileName = time_to_process_points + str(batchSize) + '/' + str(fileId)
        startedAt = time.time()
        documents = dict(iterateUserDocuments(fileName))
        sstObject = SimilarStreamAggregation(
            documents,
            default_experts_twitter_stream_settings['ssa_threshold'])
        sstObject.estimate()
        elapsed = time.time() - startedAt
        iteration_data = {}
        iteration_data['iteration_time'] = elapsed
        iteration_data['type'] = 'ssa'
        iteration_data['number_of_messages'] = batchSize * (fileId + 1)
        iteration_data['batch_size'] = batchSize
        FileIO.writeToFileAsJson(iteration_data, ssa_stats_file)

Exemplo n.º 38

0

Exibir arquivo

 def thresholdForDocumentToBeInCluterEstimation(stats_file,
                                                **stream_settings):
     ''' Estimate thresold for the clusters by varying the threshold_for_document_to_be_in_cluster value.
     Run this on a document set of size 100K. 
     '''
     for length in [
             i * j for i in 10**3, 10**4, 10**5 for j in range(1, 10)
     ]:
         #            for t in range(1, 16):
         for t in range(16, 21):
             stream_settings[
                 'threshold_for_document_to_be_in_cluster'] = t * 0.05
             print length, stream_settings[
                 'threshold_for_document_to_be_in_cluster']
             stats = {
                 'streaming_lsh':
                 KMeansTweetsFile(length, **stream_settings).
                 generateStatsForStreamingLSHClustering(),
                 'settings':
                 Settings.getSerialzedObject(stream_settings)
             }
             FileIO.writeToFileAsJson(stats, stats_file)

Exemplo n.º 39

0

Exibir arquivo

Arquivo: algorithms_performance.py Projeto: greeness/hd_streams_clustering

 def modifiedClusterAnalysisMethod(hdStreamClusteringObject,
                                   currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >=
         experts_twitter_stream_settings['cluster_filter_threshold']
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters, currentTime - previousTime, {
             'type':
             experts_twitter_stream_settings['trie_type'],
             'total_clusters':
             len(hdStreamClusteringObject.clusters),
             'current_time':
             getStringRepresentationForTweetTimestamp(currentMessageTime)
         })
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(
         currentMessageTime), iteration_data

Exemplo n.º 40

0

Exibir arquivo

 def writeGraphToFile(graph, fileName):
     """Convert graph with Networkx.getDictForGraph and pass the result to
     FileIO.writeToFileAsJson together with fileName."""
     graphAsDict = Networkx.getDictForGraph(graph)
     FileIO.writeToFileAsJson(graphAsDict, fileName)
 @staticmethod

Exemplo n.º 41

0

Exibir arquivo

    def dimensionsUpdateFrequencyEstimation(estimationObject,
                                            currentMessageTime):
        '''
        Observe the new dimensions that get added to the current dimensions if the
        dimensions are being updated at regular intervals.
        For example, the number of dimensions being added after 10m, 20m, ... 5 hours.
        As time increases the number of 'decayed' dimensions increases. The current
        dimensions have a lot of unwanted decayed dimensions. Using this information,
        identify the time interval that is best suited to refresh dimensions.
        Tentative: We decide to pick the time interval at which the rate of decay is maximum.

        Nothing is logged or stored until at least
        stream_settings['dimensions'] phrases are available.
        '''
        def updatePhraseScore(phraseObject):
            # Refresh the score so the sort below reflects currentMessageTime.
            phraseObject.updateScore(currentMessageTime, 0,
                                     **estimationObject.stream_settings)
            return phraseObject

        dimensions = estimationObject.stream_settings['dimensions']
        # Texts of the top `dimensions` phrases after re-scoring.
        newList = [
            p.text
            for p in Phrase.sort((updatePhraseScore(p)
                                  for p in estimationObject.
                                  phraseTextToPhraseObjectMap.itervalues()),
                                 reverse=True)
        ][:dimensions]
        print currentMessageTime, len(newList)
        if len(newList) >= dimensions:
            # (delta, snapshot key) pairs for every configured delta that has
            # a stored dimension list at currentMessageTime - delta.
            idsOfDimensionsListToCompare = [
                (i,
                 GeneralMethods.approximateToNearest5Minutes(
                     currentMessageTime - i))
                for i in estimationObject.dimensionUpdateTimeDeltas
                if GeneralMethods.approximateToNearest5Minutes(
                    currentMessageTime -
                    i) in estimationObject.dimensionListsMap
            ]
            # Maps str(delta.seconds) to the number of current dimensions
            # that were absent from the snapshot taken that long ago.
            dimensionsUpdateFrequency = {}
            for td, id in idsOfDimensionsListToCompare:
                oldList = estimationObject.dimensionListsMap[id]
                dimensionsUpdateFrequency[str(td.seconds)] = len(
                    set(newList).difference(oldList))
            print len(
                estimationObject.dimensionListsMap), currentMessageTime, len(
                    newList), [(k, dimensionsUpdateFrequency[k])
                               for k in sorted(dimensionsUpdateFrequency)]
            iterationData = {
                'time_stamp':
                getStringRepresentationForTweetTimestamp(currentMessageTime),
                'total_number_of_phrases':
                len(estimationObject.phraseTextToPhraseObjectMap),
                'settings':
                pprint.pformat(estimationObject.stream_settings),
                ParameterEstimation.dimensionsUpdateFrequencyId:
                dimensionsUpdateFrequency
            }
            FileIO.writeToFileAsJson(
                iterationData, estimationObject.dimensionsUpdateFrequencyFile)
            # Store a copy of the current list as the snapshot for this time.
            estimationObject.dimensionListsMap[
                GeneralMethods.approximateToNearest5Minutes(
                    currentMessageTime)] = newList[:]
            # Iterate over a copy of the keys because entries are deleted;
            # snapshots older than the last configured delta are dropped.
            for key in estimationObject.dimensionListsMap.keys()[:]:
                if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[
                        -1]:
                    del estimationObject.dimensionListsMap[key]