def generate_data_for_significant_nei_utm_ids(): output_file = GeneralMethods.get_method_id()+'.json' so_hashtags, mf_utm_id_to_valid_nei_utm_ids = set(), {} for utm_object in \ FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True): for hashtag, count in utm_object['mf_hashtag_to_count'].iteritems(): if hashtag!='total_num_of_occurrences': so_hashtags.add(hashtag) mf_utm_id_to_valid_nei_utm_ids[utm_object['utm_id']] =\ utm_object['mf_nei_utm_id_to_common_h_count'].keys() hashtags = sorted(list(so_hashtags)) mf_utm_id_to_vector = {} for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, True): # print i, utm_object['utm_id'] utm_id_vector = map(lambda hashtag: utm_object['mf_hashtag_to_count'].get(hashtag, 0.0), hashtags) mf_utm_id_to_vector[utm_object['utm_id']] = robjects.FloatVector(utm_id_vector) for i, (utm_id, vector) in enumerate(mf_utm_id_to_vector.iteritems()): print '%s of %s'%(i+1, len(mf_utm_id_to_vector)) ltuo_utm_id_and_vector = [(utm_id, vector)] for valid_nei_utm_id in mf_utm_id_to_valid_nei_utm_ids[utm_id]: if valid_nei_utm_id in mf_utm_id_to_vector and valid_nei_utm_id!=utm_id: ltuo_utm_id_and_vector.append((valid_nei_utm_id, mf_utm_id_to_vector[valid_nei_utm_id])) od = rlc.OrdDict(sorted(ltuo_utm_id_and_vector, key=itemgetter(0))) df_utm_vectors = robjects.DataFrame(od) df_utm_vectors_json = R_Helper.get_json_for_data_frame(df_utm_vectors) dfm_dict = cjson.decode(df_utm_vectors_json) mf_utm_ids_to_utm_colnames = dict(zip(zip(*ltuo_utm_id_and_vector)[0], df_utm_vectors.colnames)) utm_id_colname = mf_utm_ids_to_utm_colnames[utm_id] dfm_dict['prediction_variable'] = utm_id_colname dfm_dict['predictor_variables'] = filter(lambda colname: colname!=utm_id_colname, df_utm_vectors.colnames) dfm_dict['mf_utm_colnames_to_utm_ids'] = dict(zip(df_utm_vectors.colnames, zip(*ltuo_utm_id_and_vector)[0])) FileIO.writeToFileAsJson(dfm_dict, output_file)
def generate_hashtag_specific_location_and_pure_influence_scores(test_models_ids): for test_model_id in test_models_ids: output_file = f_ltuo_hashtag_and_ltuo_location_and_pure_influence_score%(test_model_id) GeneralMethods.runCommand('rm -rf %s'%output_file) ltuo_hashtag_and_ltuo_location_and_occurrence_time = Experiments.load_ltuo_hashtag_and_ltuo_location_and_occurrence_time() for hashtag_count, (hashtag, ltuo_location_and_occurrence_time) in\ enumerate(ltuo_hashtag_and_ltuo_location_and_occurrence_time): ltuo_location_and_occurrence_times = [(location, sorted(zip(*ito_location_and_occurrence_time)[1])) for location, ito_location_and_occurrence_time in groupby( sorted(ltuo_location_and_occurrence_time, key=itemgetter(0)), key=itemgetter(0) ) ] print hashtag_count, test_model_id ltuo_location_and_pure_influence_score = [] for location, location_occurrence_times in ltuo_location_and_occurrence_times: pure_influence_scores = [] for neighbor_location, neighbor_location_occurrence_times in ltuo_location_and_occurrence_times: if location!=neighbor_location: pure_influence_score = MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[test_model_id](neighbor_location_occurrence_times, location_occurrence_times) pure_influence_scores.append(pure_influence_score) ltuo_location_and_pure_influence_score.append([location, np.mean(pure_influence_scores)]) ltuo_location_and_pure_influence_score = sorted(ltuo_location_and_pure_influence_score, key=itemgetter(1)) FileIO.writeToFileAsJson([hashtag, ltuo_location_and_pure_influence_score], output_file)
def trendCurves(iterationData=None, experimentFileName=None):
    """Record per-time-step topic counts, or plot them from a recorded file.

    Recording mode (``iterationData`` given): ``iterationData`` is unpacked as
    ``(currentTimeStep, _, currentTopics, _, finalCall, conf)`` and the output
    file is taken from ``conf['experimentFileName']``.  On ordinary calls a
    ``{'t', 'topics'}`` record is appended; on the final call a summary record
    with the trending topic ids, the topic colours and the configuration is
    appended.

    Plotting mode (``iterationData`` falsy): the records of
    ``experimentFileName`` are read back; one filled curve is drawn per topic,
    then a second figure with only the trending topics is drawn and shown.

    Raises:
        ValueError: in plotting mode, when the file has no summary record
            (previously this surfaced as a NameError).
    """
    if iterationData:
        currentTimeStep, _, currentTopics, _, finalCall, conf = iterationData
        experimentFileName = conf['experimentFileName']
        if not finalCall:
            topicDistribution = dict(
                (str(topic.id),
                 {'total': topic.totalCount, 'timeStep': topic.countDistribution[currentTimeStep]})
                for topic in currentTopics)
            FileIO.writeToFileAsJson({'t': currentTimeStep, 'topics': topicDistribution},
                                     experimentFileName)
        else:
            # Work on a copy: deleting the key from ``conf`` itself would
            # mutate the caller's configuration, and fail if the key is absent.
            # NOTE(review): the key is presumably dropped because its value is
            # not JSON-serialisable -- confirm.
            serializableConf = dict(conf)
            serializableConf.pop('spamDectectionMethod', None)
            iterationInfo = {
                'trending_topics': [topic.id for topic in currentTopics
                                    if topic.stickiness >= stickinessLowerThreshold],
                'topic_colors': dict((str(topic.id), topic.color) for topic in currentTopics),
                'conf': serializableConf,
            }
            FileIO.writeToFileAsJson(iterationInfo, experimentFileName)
    else:
        topicsDataX = defaultdict(list)
        topicsDataY = defaultdict(list)
        topicColorMap, trendingTopics = None, None
        for data in FileIO.iterateJsonFromFile(experimentFileName):
            if 'conf' in data:
                # Summary record written by the final recording call.
                topicColorMap = data['topic_colors']
                trendingTopics = data['trending_topics']
            else:
                for topic in data['topics']:
                    topicsDataX[topic].append(data['t'])
                    topicsDataY[topic].append(data['topics'][topic]['timeStep'])
        if topicColorMap is None:
            raise ValueError('%s has no summary record with topic_colors/trending_topics'
                             % experimentFileName)
        for topic in topicsDataX:
            plt.fill_between(topicsDataX[topic], topicsDataY[topic],
                             color=topicColorMap[str(topic)], alpha=1.0)
        plt.figure()
        for topic in trendingTopics:
            plt.fill_between(topicsDataX[str(topic)], topicsDataY[str(topic)],
                             color=topicColorMap[str(topic)], alpha=1.0)
        plt.ylabel('Number of Contents', fontsize=16, fontweight='bold')
        plt.show()
def generate_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(models_ids, startTime, endTime, outputFolder, hashtag_tag): for model_id in models_ids: # if w_extra_hashtags: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag) # else: output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, wout_extra_hashtags_tag) output_file = tuo_location_and_tuo_neighbor_location_and_pure_influence_score_file%(model_id, hashtag_tag) GeneralMethods.runCommand('rm -rf %s'%output_file) for line_count, location_object in enumerate(iterateJsonFromFile( location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')) )): print line_count, model_id tuo_neighbor_location_and_pure_influence_score = [] location_hashtag_set = set(location_object['hashtags']) for neighbor_location, mf_hashtag_to_tuo_occurrences_and_time_range in location_object['links'].iteritems(): pure_influence_scores = [] for hashtag, (neighbor_location_occurrences, time_range) in mf_hashtag_to_tuo_occurrences_and_time_range.iteritems(): if hashtag in location_object['hashtags']: location_occurrences = location_object['hashtags'][hashtag][0] pure_influence_scores.append(MF_INFLUENCE_MEASURING_MODELS_TO_MODEL_ID[model_id](location_occurrences, neighbor_location_occurrences)) neighbor_location_hashtag_set = set(mf_hashtag_to_tuo_occurrences_and_time_range.keys()) if hashtag_tag==w_extra_hashtags_tag: for hashtag in location_hashtag_set.difference(neighbor_location_hashtag_set): pure_influence_scores.append(1.0) for hashtag in neighbor_location_hashtag_set.difference(location_hashtag_set): pure_influence_scores.append(-1.0) mean_pure_influence_score = np.mean(pure_influence_scores) tuo_neighbor_location_and_pure_influence_score.append([neighbor_location, mean_pure_influence_score]) tuo_neighbor_location_and_pure_influence_score = sorted(tuo_neighbor_location_and_pure_influence_score, 
key=itemgetter(1)) FileIO.writeToFileAsJson([location_object['id'], tuo_neighbor_location_and_pure_influence_score], output_file)
def generateRadiusSpots(radiusInMiles): graph = nx.Graph() spotsFile = radiusSpotsFolder+'%s'%(radiusInMiles) print 'Creating:', spotsFile for lid in locationIterator(): for location in nearbyLocations(lid, radiusInMiles): graph.add_edge(location['_id'], lid) for locations in nx.connected_components(graph): FileIO.writeToFileAsJson({'venues': locations}, spotsFile)
def generate_tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity(model_ids, startTime, endTime, outputFolder): def location_similarity(location_vector_1, location_vector_2): return reduce(lambda total, k: total+(location_vector_1.get(k,0)*location_vector_2.get(k,0)), set(location_vector_1.keys()).union(location_vector_2.keys()),0.) influence_types=[InfluenceMeasuringModels.TYPE_COMPLETE_INFLUENCE, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE] for model_id in model_ids: mf_location_to_mf_influence_type_to_influence_vector = dict(Experiments.load_tuo_location_and_mf_influence_type_to_influence_vector(model_id)) GeneralMethods.runCommand('rm -rf %s'%tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id) for line_count, location_object in enumerate(iterateJsonFromFile( location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d')) )): print line_count location = location_object['id'] tuo_neighbor_location_and_mf_influence_type_and_similarity = [] for neighbor_location in location_object['links'].keys(): mf_influence_type_and_similarity = {} for influence_type in influence_types: similarity = location_similarity( mf_location_to_mf_influence_type_to_influence_vector[location][influence_type], mf_location_to_mf_influence_type_to_influence_vector[neighbor_location][influence_type] ) mf_influence_type_and_similarity[influence_type] = similarity so_hashtags_for_location = set(location_object['hashtags'].keys()) so_hashtags_for_neighbor_location = set(location_object['links'][neighbor_location].keys()) numerator = len(so_hashtags_for_location.intersection(so_hashtags_for_neighbor_location)) + 0. denominator = len(so_hashtags_for_location.union(so_hashtags_for_neighbor_location)) + 0. 
mf_influence_type_and_similarity[JACCARD_SIMILARITY] = numerator/denominator tuo_neighbor_location_and_mf_influence_type_and_similarity.append([neighbor_location, mf_influence_type_and_similarity]) FileIO.writeToFileAsJson( [location, tuo_neighbor_location_and_mf_influence_type_and_similarity], tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id )
def generateStatsForMRKMeansClusteringQuality(): for i in [90000, 100000, 200000, 300000, 400000, 500000]: print 'Generating stats for: ',i tf = TweetsFile(i, **experts_twitter_stream_settings) FileIO.writeToFileAsJson({'mr_k_means': tf.generateStatsForKMeansMRClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, TweetsFile.mr_stats_file)
def measureRankingQuality(iterationData=None, experimentFileName=None):
    """Record the spamminess of every ranking method for the current time step.

    ``iterationData`` is unpacked as
    ``(currentTimeStep, model, _, _, finalCall, conf)``.  Nothing happens when
    it is falsy or on the final call.  Otherwise the ten topics with the
    highest value in ``model.topicsDistributionInTheTimeSet`` are used as
    queries for each method in ``conf['rankingMethods']``; the spamminess of
    each returned message list is collected per ranking id and appended to
    ``conf['experimentFileName']``.  The model's per-time-set topic
    distribution is reset afterwards.
    """
    if not iterationData:
        return
    currentTimeStep, model, _, _, finalCall, conf = iterationData
    if finalCall:
        return

    rankingMethods = conf["rankingMethods"]
    experimentFileName = conf["experimentFileName"]
    # Earlier revisions also tried weighted and random topic selection here.
    topicsByCount = sorted(model.topicsDistributionInTheTimeSet.iteritems(),
                           key=itemgetter(1), reverse=True)
    topTopics = topicsByCount[:10]

    spammnessByRankingId = defaultdict(list)
    for rankingMethod in rankingMethods:
        for queryTopic, _ in topTopics:
            ranking_id, messages = rankingMethod(queryTopic, model.topicToMessagesMap, **conf)
            spammnessByRankingId[ranking_id].append(spammness(messages, norm_k))
    record = {"currentTimeStep": currentTimeStep, "spammmess": spammnessByRankingId}
    FileIO.writeToFileAsJson(record, experimentFileName)
    model.topicsDistributionInTheTimeSet = defaultdict(int)
def generate(self):
    """Write expert tweets to ``self.fileName`` and gzip the file.

    Tweets are appended one by one; writing stops once ``self.length`` tweets
    have been written (or the iterator is exhausted).
    """
    for written, tweet in enumerate(TwitterIterators.iterateTweetsFromExperts(), 1):
        FileIO.writeToFileAsJson(tweet, self.fileName)
        if written == self.length:
            break
    os.system('gzip %s' % self.fileName)
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime): global evaluation, previousTime currentTime = time.time() documentClusters = [ cluster.documentsInCluster.keys() for k, cluster in hdStreamClusteringObject.clusters.iteritems() if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"] ] iteration_data = evaluation.getEvaluationMetrics( documentClusters, currentTime - previousTime, { "type": experts_twitter_stream_settings["dimensions_performance_type"], "dimensions": experts_twitter_stream_settings["dimensions"], }, ) iteration_data["no_of_observed_dimensions"] = len(hdStreamClusteringObject.phraseTextToPhraseObjectMap) previousTime = time.time() FileIO.writeToFileAsJson(iteration_data, JustifyDimensionsEstimation.stats_file) del iteration_data["clusters"] print currentMessageTime, iteration_data if experts_twitter_stream_settings["dimensions"] != 76819 and 2 * experts_twitter_stream_settings[ "dimensions" ] <= len(hdStreamClusteringObject.phraseTextToPhraseObjectMap): raise Exception
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences): # output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), 
input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) # output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences) print PARAMS_DICT # runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300}) runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300}) FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def dimensionsEstimation(estimationObject, currentMessageTime): ''' This class is used to dimensionsEstimation dimensions in the stream. To dimensionsEstimation it we calculate the number of phrases that need to added every iteration for different dimensions. The dimension at which the number of phrases added stablizes is the number of dimensions for the stream. Why do we need this? The aim is to get dimensions, that dont change too often at the same time are not very huge. This experiments gives us an approximate idea of the number of dimensions. Randomly picking a small value will result in dimensions that are not good and picking too big a value will result in inefficiency. ''' def updatePhraseScore(phraseObject): phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings) return phraseObject topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)] oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration if estimationObject.topDimensionsDuringPreviousIteration: dimensions_estimation = {} for boundary in estimationObject.boundaries: if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary])) print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap) iterationData = { 'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap), 'settings': estimationObject.stream_settings.convertToSerializableObject(), ParameterEstimation.dimensionsEstimationId:dimensions_estimation } FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile) estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime): ''' Observe the new dimensions that get added to current dimension if the dimensions are being updated at regular intervals. For example, number of dimensions being added after 10m, 20m,... 5 horus. As time increases the number of 'decayed' dimensions increase. The current dimensions has a lot of unwanted decayed dimensions. Using this information identify the time interval that is best suited to refresh dimensions. Tentative: We decide to pick the time interval at which the rate of decay is maximum. ''' def updatePhraseScore(phraseObject): phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings) return phraseObject dimensions = estimationObject.stream_settings['dimensions'] newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions] print currentMessageTime, len(newList) if len(newList) >= dimensions: idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap] dimensionsUpdateFrequency = {} for td, id in idsOfDimensionsListToCompare: oldList = estimationObject.dimensionListsMap[id] dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList)) print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)] iterationData = { 'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap), 'settings': pprint.pformat(estimationObject.stream_settings), ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency } 
FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile) estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:] for key in estimationObject.dimensionListsMap.keys()[:]: if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
def analyzeQuality(graphs, graphType): def getQualityScore(graphMap, edgesToKeep, timeDifference): dataToReturn = [] for j, intervalInSeconds in enumerate([1]): intervalInSeconds*=timeDifference linearGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=True, edgesToKeep=edgesToKeep) logGraph = LocationGraphs.combineLocationGraphs(graphMap, startingGraphId, datetime.datetime.fromtimestamp(endingGraphId+1), intervalInSeconds, linear=False, edgesToKeep=edgesToKeep) linearClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(linearGraph)[1], key=itemgetter(1)), key=itemgetter(1))] logarithmicClusters = [[str(c), [l[0]for l in lst]] for c, lst in groupby(sorted(clusterUsingAffinityPropagation(logGraph)[1], key=itemgetter(1)), key=itemgetter(1))] score = LocationGraphs.getClusterQualityScore(linearClusters, logarithmicClusters) print intervalInSeconds, edgesToKeep, score dataToReturn.append(score) return dataToReturn graphFile = qualityMetricsFolder%graphType print graphFile GeneralMethods.runCommand('rm -rf %s'%graphFile) for edgesToKeep in range(1,11): # for edgesToKeep in [1,10]: edgesToKeep*=0.1 graphMap = dict(graphs[:]) startingGraphId, endingGraphId = min(graphMap.keys()), max(graphMap.keys()) timeDifference = endingGraphId-startingGraphId LocationGraphs.updateLogarithmicGraphs(graphMap, edgesToKeep=edgesToKeep) # print {'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))} FileIO.writeToFileAsJson({'edgesToKeep': edgesToKeep, 'score': np.mean(getQualityScore(graphMap, edgesToKeep, timeDifference))}, graphFile)
def run_job_on_hashtags_in_dfs(mr_class, output_file): job_conf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000} print 'Running map reduce with the following params:' pprint(PARAMS_DICT) print 'Hadoop job conf:' pprint(job_conf) runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf) FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def writeClusters(hdStreamClusteringObject, currentMessageTime): print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters) iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]), 'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings) } FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime)) print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
def writeTweetsForDay(currentDay):
    """Export one day of tweets to a gzipped per-day file.

    Tweets whose ``'ca'`` value lies in ``[currentDay, currentDay + 1 day)``
    are read from the ``tweets`` collection.  Tweets whose user id has no
    screen name are skipped; the others are appended to
    ``houston_data_folder + FileIO.getFileByDay(currentDay)`` as
    ``{'id', 'text', 'created_at', 'user': {'screen_name'}}``.  The file is
    gzipped at the end.

    NOTE(review): assumes ``currentDay`` is a datetime at the start of the
    day -- confirm against the caller.
    """
    fileName = houston_data_folder + FileIO.getFileByDay(currentDay)
    # Half-open window.  The previous ``$gt currentDay`` / ``$lt +86399s``
    # bounds dropped tweets created exactly at 00:00:00 and during 23:59:59.
    dayWindow = {'$gte': currentDay, '$lt': currentDay + timedelta(days=1)}
    for tweet in tweets.find({'ca': dayWindow}, fields=['ca', 'tx', 'uid']):
        screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
        if screenName is None:
            continue
        data = {
            'id': tweet['_id'],
            'text': tweet['tx'],
            'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
            # Reuse the name looked up above instead of looking it up again.
            'user': {'screen_name': screenName},
        }
        FileIO.writeToFileAsJson(data, fileName)
    os.system('gzip %s' % fileName)
def generateStatsForDefaultStreamSettings(): for i in [10**3, 10**4, 10**5]: for j in range(1, 10): print 'Generating stats for: ',i*j tf = TweetsFile(i*j, **default_experts_twitter_stream_settings) FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, TweetsFile.default_stats_file)
def generateStatsForQualityComparisonWithSSA(): # for length in [i*j for i in 10**3, 10**4, 10**5 for j in range(1, 10)]: for length in [1000000]: print "Generating stats for: ", length tf = TweetsFile(length, **experts_twitter_stream_settings) # stats = {'ssa': tf.getStatsForSSA(), 'ssa_mr': tf.getStatsForSSAMR(), 'streaming_lsh': KMeansTweetsFile(length, **experts_twitter_stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)} stats = {"ssa_mr": tf.getStatsForSSAMR(), "settings": Settings.getSerialzedObject(tf.stream_settings)} FileIO.writeToFileAsJson(stats, TweetsFile.stats_file)
def writeCheckinSequenceGraphFile(): userSet = set([userVector['user'] for userVector in filteredUserIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, fullRecord = True)]) count, total = 1, len(userSet) for user in userSet: print user, count, total checkins = [(c['_id'], c['lid'], time.mktime(c['t'].timetuple())) for c in checkinsCollection.find({'u': user})] for i in GeneralMethods.getElementsInWindow(checkins, 2): FileIO.writeToFileAsJson([user, i], checkinSequenceGraphFile) count+=1
def generateLocationClusterData():
    """Cluster every location from ``locationClusterIterator`` and append it to ``locationClustersFile``."""
    for rawLocation in locationClusterIterator():
        clusteredLocation = clusterLocation(rawLocation)
        FileIO.writeToFileAsJson(clusteredLocation, locationClustersFile)
def writeUserClustersFile(place): print 'Generating clusters...' userVectors = GenerateDataFiles.getUserVectors(place) GeneralMethods.runCommand('rm -rf %s'%placesUserClustersFile%place['name']) clusterAssignments = Clustering.cluster(Clustering.EM, placesARFFFile%place['name'], userVectors, '-N -1') # clusterAssignments = Clustering.cluster(Clustering.KMeans, placesARFFFile%place['name'], userVectors, '-N 2') for userId, userVector in userVectors.iteritems(): userVectors[userId] = {'userVector': userVector, 'clusterId': clusterAssignments[userId]} for data in userVectors.iteritems(): FileIO.writeToFileAsJson(data, placesUserClustersFile%place['name'])
def run():
    """Generate ten batches of random graphs per enabled generator and write them out.

    Only the power-law cluster generator is enabled.  Each record is appended
    to ``randomGraphsFolder % graphType``.
    """
    # Other generators used previously: fast_gnp_random_graph / fastGnp,
    # erdos_renyi_graph / erdosRenyi, newman_watts_strogatz_graph / nWS.
    enabledGenerators = [
        (RandomGraphGenerator.powerlaw_cluster_graph, RandomGraphGenerator.powerlawClusterGraph),
    ]
    for graphType, method in enabledGenerators:
        for i in range(1, 11):
            # NOTE(review): 'n' is recorded as 100*i while the generator is
            # called with 1000*i -- confirm which one is intended.
            record = {'n': 100 * i, 'graphs': method(1000 * i)}
            FileIO.writeToFileAsJson(record, randomGraphsFolder % graphType)
def run_job(mr_class, output_file, input_files_start_time, input_files_end_time): PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple()) PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple()) print 'Running map reduce with the following params:', pprint(PARAMS_DICT) runMRJob(mr_class, output_file, MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':500}) FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def tweet_stats(input_files_start_time, input_files_end_time):
    """Run the ``TweetStats`` job over the input files of a time window.

    The result goes to ``f_tweet_stats``; ``PARAMS_DICT`` is appended to the
    same file afterwards.
    """
    input_files = getInputFiles(input_files_start_time, input_files_end_time)
    runMRJob(
        TweetStats,
        f_tweet_stats,
        input_files,
        mrJobClassParams={'job_id': 'as'},
        jobconf={'mapred.reduce.tasks': 300},
    )
    FileIO.writeToFileAsJson(PARAMS_DICT, f_tweet_stats)
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime): global evaluation, previousTime currentTime = time.time() documentClusters = [cluster.documentsInCluster.keys() for k, cluster in hdStreamClusteringObject.clusters.iteritems() if len(cluster.documentsInCluster.keys())>=experts_twitter_stream_settings['cluster_filter_threshold']] iteration_data = evaluation.getEvaluationMetrics(documentClusters, currentTime-previousTime, {'type': experts_twitter_stream_settings['lsh_type'], 'total_clusters': len(hdStreamClusteringObject.clusters), 'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)}) previousTime = time.time() FileIO.writeToFileAsJson(iteration_data, JustifyNotUsingVanillaLSH.stats_file) del iteration_data['clusters'] print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
def processing_file(self, filename, outputfile):
    """Pre-process every line of ``filename`` and append the results to ``outputfile``.

    Each line is passed to ``self.pre_processing_line``.  A result equal to
    ``0`` means "skip this line"; any other result is appended to
    ``outputfile`` as JSON.  An exception raised for one line is printed and
    processing continues with the next line.
    """
    # ``with`` closes the input file even if iteration is interrupted; the
    # handle used to be left open.
    with open(filename) as source:
        for line in source:
            try:
                newline = self.pre_processing_line(line)
                if newline == 0:
                    continue
                FileIO.writeToFileAsJson(newline, outputfile)
            except Exception as e:
                # Deliberately best-effort: report the failure, keep going.
                print(e)
def generateStatsForOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length tf = TweetsFile(length, **experts_twitter_stream_settings) FileIO.writeToFileAsJson({'streaming_lsh': tf.generateStatsForHDLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings)}, hd_clustering_performance_folder+'cda')
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings): ''' Estimate thresold for the clusters by varying the threshold_for_document_to_be_in_cluster value. Run this on a document set of size 100K. ''' for length in [i * j for i in 10 ** 3, 10 ** 4, 10 ** 5 for j in range(1, 10)]: # for t in range(1, 16): for t in range(16, 21): stream_settings['threshold_for_document_to_be_in_cluster'] = t * 0.05 print length, stream_settings['threshold_for_document_to_be_in_cluster'] stats = {'streaming_lsh': KMeansTweetsFile(length, **stream_settings).generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(stream_settings)} FileIO.writeToFileAsJson(stats, stats_file)
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    """Run the ``MRAnalysis`` job and write the lattice graph file.

    Args:
        startTime, endTime: window used to name the output file
            (formatted as ``YYYY-MM-DD``).
        outputFolder: first format argument of the output file template.
        inputFilesStartTime: start of the input-file window; defaults to
            ``startTime``.
        inputFilesEndTime: end of the input-file window; defaults to
            ``endTime``.

    ``PARAMS_DICT`` is appended to the output file after the job.
    """
    # Default each bound on its own.  Previously both were reset together
    # whenever the start was missing, which discarded an explicitly passed
    # end time and left the end as None when only the start was given.
    if not inputFilesStartTime:
        inputFilesStartTime = startTime
    if not inputFilesEndTime:
        inputFilesEndTime = endTime

    # Other output templates used with this job take the same format
    # arguments: hashtagsWithEndingWindowFile, hashtagsWithoutEndingWindowFile,
    # hashtagsWithoutEndingWindowWithoutLatticeApproximationFile,
    # hashtagsAllOccurrencesWithinWindowFile, timeUnitWithOccurrencesFile.
    outputFile = latticeGraphFile % (
        outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    runMRJob(MRAnalysis, outputFile, getInputFiles(inputFilesStartTime, inputFilesEndTime),
             jobconf={'mapred.reduce.tasks': 300})
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
def generateStatsForOptimized(): # for i in [10**3, 10**4, 10**5]: for length in [1000000, 1100000, 1200000]: # for i in [10**6]: # for j in range(1, 10): print 'Generating stats for: ', length tf = TweetsFile(length, **experts_twitter_stream_settings) FileIO.writeToFileAsJson( { 'streaming_lsh': tf.generateStatsForHDLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings) }, hd_clustering_performance_folder + 'cda')
def generateStatsForDefaultStreamSettings(): for i in [10**3, 10**4, 10**5]: for j in range(1, 10): print 'Generating stats for: ', i * j tf = TweetsFile(i * j, **default_experts_twitter_stream_settings) FileIO.writeToFileAsJson( { 'streaming_lsh': tf.generateStatsForStreamingLSHClustering(), 'settings': Settings.getSerialzedObject(tf.stream_settings) }, TweetsFile.default_stats_file)
def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
    '''
    Record how long phrases have been inactive at currentMessageTime.

    For every phrase object in estimationObject.phraseTextToPhraseObjectMap the
    lag between currentMessageTime and the phrase's latestOccuranceTime is
    computed with DateTimeAirthematic.getDifferenceInTimeUnits and counted in a
    histogram keyed by str(lag). The histogram is passed to
    FileIO.writeToFileAsJson together with the time stamp, the pretty-printed
    settings and estimationObject.lagBetweenMessagesDistribution, targeting
    estimationObject.dimensionInActivityTimeFile.
    '''
    lag_histogram = defaultdict(int)
    phrase_map = estimationObject.phraseTextToPhraseObjectMap
    for phrase in phrase_map.itervalues():
        # Looked up per phrase, exactly as before, so nothing is read from
        # stream_settings when there are no phrases.
        unit_seconds = estimationObject.stream_settings['time_unit_in_seconds'].seconds
        lag = DateTimeAirthematic.getDifferenceInTimeUnits(
            currentMessageTime, phrase.latestOccuranceTime, unit_seconds)
        lag_histogram[str(lag)] += 1
    print(currentMessageTime)
    iterationData = {
        'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
        'settings': pprint.pformat(estimationObject.stream_settings),
        ParameterEstimation.dimensionInActivityTimeId: estimationObject.lagBetweenMessagesDistribution,
        'phrases_lag_distribution': lag_histogram,
    }
    FileIO.writeToFileAsJson(iterationData,
                             estimationObject.dimensionInActivityTimeFile)
def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
    '''
    Record the distribution of cluster lags at currentMessageTime.

    For every cluster in hdStreamClusteringObject.clusters the lag between
    currentMessageTime and the cluster's lastStreamAddedTime is computed with
    DateTimeAirthematic.getDifferenceInTimeUnits and counted in a histogram
    keyed by str(lag). The histogram, the time stamp, the pretty-printed
    settings and the 'lag_between_streams_added_to_cluster' setting are passed
    to FileIO.writeToFileAsJson; the target is read from stream_settings under
    '<clusterLagDistributionId>_file'.
    '''
    settings = hdStreamClusteringObject.stream_settings
    lag_histogram = defaultdict(int)
    for cluster in hdStreamClusteringObject.clusters.values():
        unit_seconds = settings['time_unit_in_seconds'].seconds
        lag = DateTimeAirthematic.getDifferenceInTimeUnits(
            currentMessageTime, cluster.lastStreamAddedTime, unit_seconds)
        lag_histogram[str(lag)] += 1
    print('%s %s' % (currentMessageTime, len(hdStreamClusteringObject.clusters)))
    distribution_id = ClusteringParametersEstimation.clusterLagDistributionId
    iterationData = {
        'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
        'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
        distribution_id: lag_histogram,
        'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster'],
    }
    target = hdStreamClusteringObject.stream_settings[
        '%s_file' % ClusteringParametersEstimation.clusterLagDistributionId]
    FileIO.writeToFileAsJson(iterationData, target)
def dimensionsEstimation(estimationObject, currentMessageTime):
    '''
    Estimate how many dimensions the stream needs.

    On every call the phrases are re-scored and ranked, and for each boundary
    in estimationObject.boundaries we count how many of the current top
    `boundary` phrases were not in the previous iteration's top `boundary`.
    The boundary at which this count stabilizes is the number of dimensions
    for the stream.

    Why do we need this? The aim is to get dimensions that do not change too
    often and at the same time are not very large. Picking too small a value
    gives poor dimensions; picking too big a value is inefficient. This
    experiment gives an approximate idea of a suitable number.

    Nothing is written on a call where
    estimationObject.topDimensionsDuringPreviousIteration is empty; the
    current ranking is always stored there for the next call.
    '''
    def rescore(phrase):
        phrase.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phrase

    rescored = (rescore(phrase) for phrase in
                estimationObject.phraseTextToPhraseObjectMap.itervalues())
    current_top = [phrase.text for phrase in Phrase.sort(rescored, reverse=True)]
    previous_top = estimationObject.topDimensionsDuringPreviousIteration

    if previous_top:
        phrase_count = len(estimationObject.phraseTextToPhraseObjectMap)
        dimensions_estimation = {}
        for boundary in estimationObject.boundaries:
            if boundary >= phrase_count:
                continue
            added = set(current_top[:boundary]).difference(previous_top[:boundary])
            dimensions_estimation[str(boundary)] = len(added)
        print('%s %s' % (currentMessageTime,
                         len(estimationObject.phraseTextToPhraseObjectMap)))
        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
            'settings': estimationObject.stream_settings.convertToSerializableObject(),
            ParameterEstimation.dimensionsEstimationId: dimensions_estimation,
        }
        FileIO.writeToFileAsJson(iterationData,
                                 estimationObject.dimensionsEstimationFile)

    # Store a copy so later changes to this call's list cannot leak into the
    # state kept for the next iteration.
    estimationObject.topDimensionsDuringPreviousIteration = current_top[:]
def generateStatsForUnOptimized():
    '''
    Generate HD-LSH clustering stats with the default (unoptimized) settings.

    For each of the lengths 1000000, 1100000 and 1200000, builds a TweetsFile
    with default_experts_twitter_stream_settings, calls
    generateStatsForHDLSHClustering() and passes the stats plus the serialized
    stream settings to FileIO.writeToFileAsJson with the target
    hd_clustering_performance_folder + 'cda_unopt'. Afterwards the 'clusters'
    entry is removed from the stats and the remainder is printed.
    '''
    # Earlier runs iterated over 10**3..10**5 (times 1..9) and 10**6, and
    # optionally set 'cluster_filtering_method' to emptyClusterFilteringMethod.
    lengths = (1000000, 1100000, 1200000)
    for length in lengths:
        print('%s %s' % ('Generating stats for: ', length))
        tf = TweetsFile(length, **default_experts_twitter_stream_settings)
        performance = tf.generateStatsForHDLSHClustering()
        record = {
            'streaming_lsh': performance,
            'settings': Settings.getSerialzedObject(tf.stream_settings),
        }
        target = hd_clustering_performance_folder + 'cda_unopt'
        FileIO.writeToFileAsJson(record, target)
        # Drop the 'clusters' entry before echoing the summary to stdout.
        del performance['clusters']
        print(performance)
def getStatsForSSA():
    '''
    Time SimilarStreamAggregation on pre-batched document files.

    Sets default_experts_twitter_stream_settings['ssa_threshold'] to 0.75 (a
    change to the shared settings dict), then for file ids 21..49 loads
    time_to_process_points + '<batchSize>/<id>' via iterateUserDocuments, runs
    SimilarStreamAggregation(...).estimate() and passes the elapsed wall-clock
    time and batch bookkeeping to FileIO.writeToFileAsJson with the target
    ssa_stats_file.
    '''
    batchSize = 10000
    default_experts_twitter_stream_settings['ssa_threshold'] = 0.75
    for file_index in range(21, 50):
        fileName = time_to_process_points + '%s/%s' % (batchSize, file_index)
        started_at = time.time()
        documents = dict(iterateUserDocuments(fileName))
        threshold = default_experts_twitter_stream_settings['ssa_threshold']
        aggregation = SimilarStreamAggregation(documents, threshold)
        aggregation.estimate()
        elapsed = time.time() - started_at
        iteration_data = {
            'iteration_time': elapsed,
            'type': 'ssa',
            'number_of_messages': batchSize * (file_index + 1),
            'batch_size': batchSize,
        }
        FileIO.writeToFileAsJson(iteration_data, ssa_stats_file)
def thresholdForDocumentToBeInCluterEstimation(stats_file, **stream_settings):
    '''
    Estimate the cluster threshold by varying
    threshold_for_document_to_be_in_cluster.

    Lengths i * j (i in 10**3, 10**4, 10**5; j in 1..9) are combined with the
    thresholds t * 0.05 for t in 16..20. For every combination the stats from
    KMeansTweetsFile(...).generateStatsForStreamingLSHClustering() and the
    serialized settings are passed to FileIO.writeToFileAsJson with stats_file.
    The original note recommends running this on a document set of size 100K.

    stats_file -- destination passed to FileIO.writeToFileAsJson.
    stream_settings -- keyword settings forwarded to KMeansTweetsFile; the
        threshold entry is overwritten on every iteration.
    '''
    setting_name = 'threshold_for_document_to_be_in_cluster'
    document_counts = [base * factor
                       for base in (10 ** 3, 10 ** 4, 10 ** 5)
                       for factor in range(1, 10)]
    for length in document_counts:
        # A previous sweep used range(1, 16) for the threshold steps.
        for t in range(16, 21):
            stream_settings[setting_name] = t * 0.05
            print('%s %s' % (length, stream_settings[setting_name]))
            tweets_file = KMeansTweetsFile(length, **stream_settings)
            lsh_stats = tweets_file.generateStatsForStreamingLSHClustering()
            serialized_settings = Settings.getSerialzedObject(stream_settings)
            stats = {'streaming_lsh': lsh_stats,
                     'settings': serialized_settings}
            FileIO.writeToFileAsJson(stats, stats_file)
def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
    '''
    Evaluate the current clusters and log the metrics.

    Collects the document keys of every cluster whose size is at least
    experts_twitter_stream_settings['cluster_filter_threshold'], passes them to
    evaluation.getEvaluationMetrics together with the time elapsed since the
    module-level previousTime, and hands the result to
    FileIO.writeToFileAsJson with JustifyTrie.stats_file. previousTime is
    reset, and the metrics without their 'clusters' entry are printed.
    '''
    global evaluation, previousTime
    currentTime = time.time()

    documentClusters = []
    for _, cluster in hdStreamClusteringObject.clusters.iteritems():
        document_ids = cluster.documentsInCluster.keys()
        minimum_size = experts_twitter_stream_settings['cluster_filter_threshold']
        if len(document_ids) >= minimum_size:
            documentClusters.append(document_ids)

    run_info = {
        'type': experts_twitter_stream_settings['trie_type'],
        'total_clusters': len(hdStreamClusteringObject.clusters),
        'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime),
    }
    iteration_data = evaluation.getEvaluationMetrics(
        documentClusters, currentTime - previousTime, run_info)
    previousTime = time.time()
    FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)

    # Drop the 'clusters' entry before echoing the metrics to stdout.
    del iteration_data['clusters']
    print('%s %s' % (getStringRepresentationForTweetTimestamp(currentMessageTime),
                     iteration_data))
def writeGraphToFile(graph, fileName): FileIO.writeToFileAsJson(Networkx.getDictForGraph(graph), fileName) @staticmethod
def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
    '''
    Estimate how often the set of dimensions should be refreshed.

    Observes how many new dimensions would enter the current top dimensions
    if they were refreshed at regular intervals (for example after 10m, 20m,
    ... 5 hours). As time passes, more of the older dimensions have decayed,
    so an old dimension list contains many unwanted entries. This information
    is used to pick the refresh interval.
    Tentative: pick the interval at which the rate of decay is highest.

    On each call the phrases are re-scored and the top
    stream_settings['dimensions'] phrase texts are taken. When that many are
    available, the list is compared against the snapshots stored in
    estimationObject.dimensionListsMap for each delta in
    estimationObject.dimensionUpdateTimeDeltas, and the counts are passed to
    FileIO.writeToFileAsJson with estimationObject.dimensionsUpdateFrequencyFile.
    The current list is then stored as a new snapshot, and snapshots older
    than the last delta are removed.
    '''
    def rescore(phrase):
        phrase.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
        return phrase

    dimensions = estimationObject.stream_settings['dimensions']
    rescored = (rescore(phrase) for phrase in
                estimationObject.phraseTextToPhraseObjectMap.itervalues())
    ranked_texts = [phrase.text for phrase in Phrase.sort(rescored, reverse=True)]
    newList = ranked_texts[:dimensions]
    print('%s %s' % (currentMessageTime, len(newList)))

    if len(newList) >= dimensions:
        # Keep only the deltas for which a snapshot exists at the matching
        # 5-minute slot.
        comparable = []
        for delta in estimationObject.dimensionUpdateTimeDeltas:
            snapshot_key = GeneralMethods.approximateToNearest5Minutes(
                currentMessageTime - delta)
            if snapshot_key in estimationObject.dimensionListsMap:
                comparable.append((delta, snapshot_key))

        dimensionsUpdateFrequency = {}
        for delta, snapshot_key in comparable:
            oldList = estimationObject.dimensionListsMap[snapshot_key]
            newly_added = set(newList).difference(oldList)
            dimensionsUpdateFrequency[str(delta.seconds)] = len(newly_added)

        sorted_counts = [(key, dimensionsUpdateFrequency[key])
                         for key in sorted(dimensionsUpdateFrequency)]
        print('%s %s %s %s' % (len(estimationObject.dimensionListsMap),
                               currentMessageTime, len(newList), sorted_counts))

        iterationData = {
            'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
            'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
            'settings': pprint.pformat(estimationObject.stream_settings),
            ParameterEstimation.dimensionsUpdateFrequencyId: dimensionsUpdateFrequency,
        }
        FileIO.writeToFileAsJson(
            iterationData, estimationObject.dimensionsUpdateFrequencyFile)

        current_slot = GeneralMethods.approximateToNearest5Minutes(currentMessageTime)
        estimationObject.dimensionListsMap[current_slot] = newList[:]

        # Iterate over a copy of the keys because entries are deleted inside
        # the loop.
        for snapshot_time in list(estimationObject.dimensionListsMap.keys()):
            if currentMessageTime - snapshot_time > estimationObject.dimensionUpdateTimeDeltas[-1]:
                del estimationObject.dimensionListsMap[snapshot_time]