def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
    # Alternative output files; uncomment the one that matches the analysis being run.
    # output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak % (
        input_files_start_time.strftime('%Y-%m-%d'),
        input_files_end_time.strftime('%Y-%m-%d'),
        min_hashtag_occurrences
    )
    # output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    # output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    print PARAMS_DICT
    # runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks': 300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks': 300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
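# Minimal usage sketch (not from the original source): mr_data_analysis takes datetime
# bounds for the input window plus a minimum hashtag-occurrence threshold; the dates and
# threshold below are hypothetical placeholders.
# from datetime import datetime
# mr_data_analysis(datetime(2011, 1, 1), datetime(2011, 12, 31), 25)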
def model_performance():
    runMRJob(
        ModelPerformance,
        f_model_performance,
        [dfs_input],
        jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    )
def hashtags_by_models_by_locations():
    runMRJob(
        HashtagsByModelsByLocations,
        f_hashtags_by_models_by_locations,
        # [dfs_input],
        MRAnalysis.get_input_files(),
        jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    )
def run_job_on_hashtags_in_dfs(mr_class, output_file):
    job_conf = {'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    print 'Running map reduce with the following params:'
    pprint(PARAMS_DICT)
    print 'Hadoop job conf:'
    pprint(job_conf)
    runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
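# Usage sketch (a hypothetical pairing, not from the original source): any of the
# hashtag-object MRJob classes defined elsewhere in this module can be passed together
# with its matching output file, e.g.:
# run_job_on_hashtags_in_dfs(WordObjectExtractor, f_word_objects_extractor)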
def tweet_stats(input_files_start_time, input_files_end_time):
    mr_class = TweetStats
    output_file = f_tweet_stats
    runMRJob(mr_class,
             output_file,
             getInputFiles(input_files_start_time, input_files_end_time),
             mrJobClassParams={'job_id': 'as'},
             jobconf={'mapred.reduce.tasks': 300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
def significant_nei_utm_ids():
    input_file = hdfs_input_folder % 'generate_data_for_significant_nei_utm_ids' + \
        'generate_data_for_significant_nei_utm_ids.json'
    print input_file
    print f_significant_nei_utm_ids
    runMRJob(SignificantNeirghborUTMIds,
             f_significant_nei_utm_ids,
             [input_file],
             jobconf={'mapred.reduce.tasks': 50, 'mapred.task.timeout': 86400000})
def word_object_contingency_table_extractor():
    mr_class = WordHashtagContingencyTableObjectExtractor
    output_file = f_word_hashtag_contigency_table_objects
    runMRJob(
        mr_class,
        output_file,
        [f_hdfs_hashtags],
        jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    )
def tag_cooccurence(inputfile):
    mr_class = TagCooccur
    output_file = f_tag_cooccur
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def user_tag_dict(inputfile):
    mr_class = UserTagDict
    output_file = f_user_tag_dict
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def label_cnt(inputfile):
    mr_class = CountLabel
    output_file = f_label_cnt
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist)
def get_sp(inputfile):
    mr_class = SP
    output_file = f_sp
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist)
def topic_rank(inputfile):
    mr_class = TopicRank
    output_file = f_topic_rank
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def sift_tag_by_tagger(inputfile):
    mr_class = SiftTagByTagger
    output_file = f_sifted_tag_tagger
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def tag_cooccurence1(inputfile):
    mr_class = TagCooccur1
    output_file = f_tag_cooccur_1
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def true_cosine2(inputfile):
    mr_class = TrueCosine2
    output_file = f_cosine_tag
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 2})
def zj_list(inputfile):
    mr_class = ZJ_list
    output_file = f_zj_in
    inputfilelist = [inputfile]
    runMRJob(mr_class, output_file, inputfilelist, jobconf={'mapred.reduce.tasks': 10})
def word_object_extractor():
    mr_class = WordObjectExtractor
    output_file = f_word_objects_extractor
    runMRJob(
        mr_class,
        output_file,
        [f_hdfs_hashtags],
        jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000}
    )
def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
    PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
    PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
    print 'Running map reduce with the following params:'
    pprint(PARAMS_DICT)
    runMRJob(mr_class,
             output_file,
             MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time),
             jobconf={'mapred.reduce.tasks': 500})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
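# Usage sketch (hypothetical arguments, not from the original source): run_job is called
# with an MRJob subclass, its output path, and the datetime bounds that select the dated
# input files, e.g.:
# from datetime import datetime
# run_job(MRAnalysis, output_file, datetime(2011, 2, 1), datetime(2011, 4, 30))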
def detect(infiles, outfile):
    mr_class = LangDetectTwitter
    runMRJob(mr_class,
             outfile,
             infiles,
             mrJobClassParams={'job_id': 'as'},
             args=[],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    if not inputFilesStartTime:
        inputFilesStartTime = startTime
        inputFilesEndTime = endTime
    # Alternative output files; uncomment the one that matches the analysis being run.
    # outputFile = hashtagsWithEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = hashtagsWithoutEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = hashtagsWithoutEndingWindowWithoutLatticeApproximationFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = hashtagsAllOccurrencesWithinWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = timeUnitWithOccurrencesFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    outputFile = latticeGraphFile % (outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = 'mr_Data/timeUnitWithOccurrences'
    runMRJob(MRAnalysis,
             outputFile,
             getInputFiles(inputFilesStartTime, inputFilesEndTime),
             jobconf={'mapred.reduce.tasks': 300})
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
def filter_tweets(input_files_start_time, input_files_end_time):
    mr_class = FilterElectionTweets
    output_file = os.path.expanduser('~/ElectionTweetsAnalysis/data/%s/') % 'results/filter' + 'tweets'
    input_folder = '/mnt/chevron/bde/Data/TweetData/SampleTweets/2012/'
    runMRJob(mr_class,
             output_file,
             FilterElectionTweetsMRJobRunner.get_dated_input_files(input_files_start_time,
                                                                   input_files_end_time,
                                                                   input_folder),
             mrJobClassParams={'job_id': 'as'},
             #args = [],
             jobconf={'mapred.reduce.tasks': 300})
def filter_tweets(input_files):
    mr_class = FilterTweets
    output_file = f_local_tweets_filtered
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000})
def geo_analysis(input_files):
    mr_class = TweetsGeoAnalysis
    output_file = f_geo_distrib
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def tweet_texts(input_files):
    mr_class = TweetTexts
    output_file = f_tweet_texts
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def location_user_pairs(input_files):
    mr_class = LocationUserPairs
    output_file = os.path.expanduser('~/LocalExperts/data/results/%s/') % 'local_tweets' + 'locationsbymentions'
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def user_mention_map(input_files):
    mr_class = UserMentionTweets
    output_file = f_mentions
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 500, 'mapred.task.timeout': 86400000})
def noun_extractor(input_files):
    mr_class = POSTagger
    output_file = noun_cloud_output
    runMRJob(mr_class,
             output_file,
             # uncomment when running on local
             #fs.get_local_input_files(local_tweets_input_folder),
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def get_users(input_files_start_time, input_files_end_time, input_folder):
    mr_class = Users
    output_file = f_users
    input_files = fs.get_dated_input_files(input_files_start_time, input_files_end_time, input_folder)
    runMRJob(mr_class,
             output_file,
             input_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 8640000})
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    if not inputFilesStartTime:
        inputFilesStartTime = startTime
        inputFilesEndTime = endTime
    outputFile = f_hashtag_objects % (outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    # outputFile = f_ltuo_location_and_ltuo_hashtag_and_occurrence_time%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    runMRJob(
        MRAnalysis,
        outputFile,
        getInputFiles(inputFilesStartTime, inputFilesEndTime),
        jobconf={'mapred.reduce.tasks': 300},
    )
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
def count_at_mentions(input_files_start_time, input_files_end_time):
    mr_class = CountAtMentionTweets
    output_file = f_count_at_mentions
    runMRJob(mr_class,
             output_file,
             # uncomment when running on local
             #fs.get_local_input_files(local_tweets_input_folder),
             fs.get_dated_input_files(input_files_start_time, input_files_end_time, input_folder),
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 86400000})
def get_pins(input_files_start_time, input_files_end_time, input_folder):
    mr_class = Pins
    output_file = f_pins
    chevron_files = fs.get_dated_input_files(input_files_start_time, input_files_end_time, input_folder)
    '''
    hdfs_files = []
    for file in chevron_files:
        hdfs_files = hdfs_rel_path + file
    '''
    runMRJob(mr_class,
             output_file,
             chevron_files,
             mrJobClassParams={'job_id': 'as'},
             # uncomment when running on local
             #args = [],
             jobconf={'mapred.reduce.tasks': 300, 'mapred.task.timeout': 8640000})
def RunJob(mr_class, outputfile, input_file_start_time, input_file_end_time):
    runMRJob(mr_class,
             outputfile,
             getInputFiles(input_file_start_time, input_file_end_time),
             jobconf={'mapred.reduce.tasks': 100})
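# Driver sketch (hypothetical values, not from the original source): RunJob is a thin
# wrapper around runMRJob, so a caller supplies an MRJob subclass, an output path, and
# the datetime bounds that getInputFiles() uses to pick dated input files, e.g.:
# from datetime import datetime
# RunJob(MRAnalysis, outputfile, datetime(2011, 2, 1), datetime(2011, 4, 30))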