Example #1
def mr_data_analysis(input_files_start_time, input_files_end_time, min_hashtag_occurrences):
#    output_file = f_tuo_normalized_occurrence_count_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tweet_count_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_distribution_value%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
    output_file = f_tuo_hashtag_and_occurrence_count_and_entropy_and_focus_and_coverage_and_peak%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_rank_and_average_percentage_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_iid_and_interval_stats%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_iid_and_perct_change_of_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_hashtag_objects%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_lid_and_ltuo_other_lid_and_temporal_distance%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_lid_and_ltuo_other_lid_and_no_of_co_occurrences%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_high_accuracy_lid_and_distribution%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_hashtags_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)
#    output_file = f_tuo_no_of_locations_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

#    output_file = f_tuo_no_of_peak_lids_and_count%(input_files_start_time.strftime('%Y-%m-%d'), input_files_end_time.strftime('%Y-%m-%d'), min_hashtag_occurrences)

    print PARAMS_DICT
#    runMRJob(MRAnalysis, output_file, getInputFiles(input_files_start_time, input_files_end_time), jobconf={'mapred.reduce.tasks':300})
    runMRJob(MRAnalysis, output_file, getPreprocessedHashtagsFile(), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, output_file)

def model_performance():
    runMRJob(
                ModelPerformance,
                f_model_performance,
                [dfs_input],
                jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
             )

def hashtags_by_models_by_locations():
    runMRJob(
                HashtagsByModelsByLocations,
                f_hashtags_by_models_by_locations,
                # [dfs_input],
                MRAnalysis.get_input_files(),
                jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
             )
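
Every snippet on this page funnels through the same runMRJob helper, which each project imports from its own utilities and never shows here. Below is a minimal sketch of what such a wrapper could look like on top of the mrjob library, assuming mrjob's standard --jobconf flag and make_runner() API; the parameter names are inferred from the call sites above, and mrJobClassParams/args handling is collapsed into **kwargs because its mechanics are not visible in these examples.

def runMRJob(mr_class, output_file, input_files, jobconf=None, **kwargs):
    # Sketch only, not the projects' actual implementation.
    job_args = ['-r', 'hadoop'] + list(input_files)
    for key, value in (jobconf or {}).items():
        # mrjob's --jobconf forwards Hadoop properties such as
        # mapred.reduce.tasks and mapred.task.timeout to the cluster.
        job_args += ['--jobconf', '%s=%s' % (key, value)]
    job = mr_class(args=job_args)
    with job.make_runner() as runner:
        runner.run()
        # stream_output() yields the job's output lines (mrjob <= 0.5).
        with open(output_file, 'w') as out:
            for line in runner.stream_output():
                out.write(line)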
Example #4
 def run_job_on_hashtags_in_dfs(mr_class, output_file):
     job_conf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     print 'Hadoop job conf:'
     pprint(job_conf)
     runMRJob(mr_class, output_file, [f_hdfs_hashtags], jobconf=job_conf)
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example #5
 def tweet_stats(input_files_start_time, input_files_end_time):
     mr_class = TweetStats
     output_file = f_tweet_stats
     runMRJob(mr_class,
              output_file,
              getInputFiles(input_files_start_time, input_files_end_time),
              mrJobClassParams = {'job_id': 'as'},
              jobconf={'mapred.reduce.tasks':300})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
Example #6
 def significant_nei_utm_ids():
     input_file = hdfs_input_folder%'generate_data_for_significant_nei_utm_ids'+\
                                                             'generate_data_for_significant_nei_utm_ids.json'
     print input_file
     print f_significant_nei_utm_ids
     runMRJob(SignificantNeirghborUTMIds,
              f_significant_nei_utm_ids,
              [input_file],
              jobconf={'mapred.reduce.tasks':50, 'mapred.task.timeout': 86400000})
Example #7
 def word_object_contingency_table_extractor():
     mr_class = WordHashtagContingencyTableObjectExtractor
     output_file = f_word_hashtag_contigency_table_objects
     runMRJob(
                  mr_class,
                  output_file,
                  [f_hdfs_hashtags],
                  jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
              )
Example #8
File: analysis.py Project: WeiNiu/lsfolk
 def tag_cooccurence(inputfile):
     mr_class = TagCooccur
     output_file = f_tag_cooccur
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 10})
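
The analysis.py examples here all build their input list imperatively; inputfilelist = [] followed by a single append is just a one-element list literal, so each of these runners collapses to a single call. The example above, written that way:

def tag_cooccurence(inputfile):
    runMRJob(TagCooccur, f_tag_cooccur, [inputfile],
             jobconf={'mapred.reduce.tasks': 10})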
Example #9
File: analysis.py Project: WeiNiu/lsfolk
 def user_tag_dict(inputfile):
     mr_class = UserTagDict
     output_file = f_user_tag_dict
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':10})
Example #10
File: analysis.py Project: WeiNiu/lsfolk
 def label_cnt(inputfile):
     mr_class = CountLabel
     output_file = f_label_cnt
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              )        
Example #11
File: analysis.py Project: WeiNiu/lsfolk
 def get_sp(inputfile):
     mr_class = SP
     output_file = f_sp
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              )
Example #12
File: analysis.py Project: WeiNiu/lsfolk
 def topic_rank(inputfile):
     mr_class = TopicRank
     output_file = f_topic_rank
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':10})   
Example #13
File: analysis.py Project: WeiNiu/lsfolk
 def sift_tag_by_tagger(inputfile):
     mr_class = SiftTagByTagger
     output_file = f_sifted_tag_tagger
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':10})   
Example #14
File: analysis.py Project: WeiNiu/lsfolk
 def tag_cooccurence1(inputfile):
     mr_class = TagCooccur1
     output_file = f_tag_cooccur_1
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':10})   
Example #15
File: analysis.py Project: WeiNiu/lsfolk
 def topic_rank(inputfile):
     mr_class = TopicRank
     output_file = f_topic_rank
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 10})
Example #16
File: analysis.py Project: WeiNiu/lsfolk
 def sift_tag_by_tagger(inputfile):
     mr_class = SiftTagByTagger
     output_file = f_sifted_tag_tagger
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 10})
Example #17
 def true_cosine2(inputfile):
     mr_class = TrueCosine2
     output_file = f_cosine_tag
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 2})
Example #18
File: analysis.py Project: WeiNiu/lsfolk
 def zj_list(inputfile):
     mr_class = ZJ_list
     output_file = f_zj_in
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 10})
Example #19
 def word_object_extractor():
     mr_class = WordObjectExtractor
     output_file = f_word_objects_extractor
     runMRJob(
                  mr_class,
                  output_file,
                  [f_hdfs_hashtags],
                  jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
              )
Example #20
File: analysis.py Project: WeiNiu/lsfolk
 def true_cosine2(inputfile):
     mr_class =  TrueCosine2
     output_file = f_cosine_tag
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':2})
Example #21
File: analysis.py Project: WeiNiu/lsfolk
 def zj_list(inputfile):
     mr_class = ZJ_list
     output_file = f_zj_in
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf = {'mapred.reduce.tasks':10})   
Example #22
 def run_job(mr_class, output_file, input_files_start_time, input_files_end_time):
     PARAMS_DICT['input_files_start_time'] = time.mktime(input_files_start_time.timetuple())
     PARAMS_DICT['input_files_end_time'] = time.mktime(input_files_end_time.timetuple())
     print 'Running map reduce with the following params:'
     pprint(PARAMS_DICT)
     runMRJob(mr_class,
              output_file,
              MRAnalysis.get_input_files_with_tweets(input_files_start_time, input_files_end_time),
              jobconf={'mapred.reduce.tasks':500})
     FileIO.writeToFileAsJson(PARAMS_DICT, output_file)
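
The time.mktime(...timetuple()) idiom above turns a naive local datetime into epoch seconds before it is recorded in PARAMS_DICT. A small standalone illustration with an arbitrary date:

import datetime
import time

start = datetime.datetime(2012, 1, 1)
# Epoch seconds for a naive datetime interpreted in local time.
start_ts = time.mktime(start.timetuple())
# fromtimestamp inverts the conversion, so the round trip is exact
# as long as the datetime carries no microseconds.
assert datetime.datetime.fromtimestamp(start_ts) == start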
Example #23
File: analysis.py Project: WeiNiu/lsfolk
 def user_tag_dict(inputfile):
     mr_class = UserTagDict
     output_file = f_user_tag_dict
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(mr_class,
              output_file,
              inputfilelist,
              jobconf={'mapred.reduce.tasks': 10})
Example #24
	def detect(infiles, outfile):
		mr_class = LangDetectTwitter
		runMRJob(mr_class,
			outfile,
			infiles,
			mrJobClassParams = {'job_id': 'as'},
			args = [],
			jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout':86400000}
		)
Example #25
File: analysis.py Project: WeiNiu/lsfolk
 def label_cnt(inputfile):
     mr_class = CountLabel
     output_file = f_label_cnt
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(
         mr_class,
         output_file,
         inputfilelist,
     )
Example #26
File: analysis.py Project: WeiNiu/lsfolk
 def get_sp(inputfile):
     mr_class = SP
     output_file = f_sp
     inputfilelist = []
     inputfilelist.append(inputfile)
     runMRJob(
         mr_class,
         output_file,
         inputfilelist,
     )
Example #27
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    if not inputFilesStartTime:
        inputFilesStartTime = startTime
        inputFilesEndTime = endTime
#    outputFile = hashtagsWithEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsWithoutEndingWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsWithoutEndingWindowWithoutLatticeApproximationFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = hashtagsAllOccurrencesWithinWindowFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = timeUnitWithOccurrencesFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    outputFile = latticeGraphFile%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
#    outputFile = 'mr_Data/timeUnitWithOccurrences'
    runMRJob(MRAnalysis, outputFile, getInputFiles(inputFilesStartTime, inputFilesEndTime), jobconf={'mapred.reduce.tasks':300})
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
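
Most of these runners finish with FileIO.writeToFileAsJson(PARAMS_DICT, output_file) so the parameters that produced a result travel with it. The implementation is not shown anywhere in these examples; a plausible minimal version, assuming it simply appends the object as one JSON line, would be:

import json

def writeToFileAsJson(data, output_file):
    # Hypothetical sketch: append data as a single JSON line so the
    # run's parameters sit alongside the job output in the same file.
    with open(output_file, 'a') as f:
        f.write(json.dumps(data) + '\n')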
Example #28
	def filter_tweets(input_files_start_time, input_files_end_time):
		mr_class = FilterElectionTweets
		output_file = os.path.expanduser('~/ElectionTweetsAnalysis/data/%s/') % 'results/filter' + 'tweets'
		input_folder = '/mnt/chevron/bde/Data/TweetData/SampleTweets/2012/'
		runMRJob(mr_class,
							output_file,
							FilterElectionTweetsMRJobRunner.get_dated_input_files(input_files_start_time, input_files_end_time, input_folder),
							mrJobClassParams = {'job_id': 'as'},
							#args = [],
						jobconf = {'mapred.reduce.tasks': 300}
		)
Example #29
 def filter_tweets(input_files):
   mr_class = FilterTweets
   output_file = f_local_tweets_filtered
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
   )
 def geo_analysis(input_files):
   mr_class = TweetsGeoAnalysis
   output_file = f_geo_distrib
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 86400000}
   )
 def tweet_texts(input_files):
   mr_class = TweetTexts
   output_file = f_tweet_texts
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 86400000}
   )
 def location_user_pairs(input_files):
   mr_class = LocationUserPairs
   output_file = os.path.expanduser('~/LocalExperts/data/results/%s/') % 'local_tweets' + 'locationsbymentions'
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 86400000}
   )
Example #33
 def user_mention_map(input_files):
   mr_class = UserMentionTweets
   output_file = f_mentions
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':500, 'mapred.task.timeout': 86400000}
   )
Example #34
 def filter_tweets(input_files_start_time, input_files_end_time):
     mr_class = FilterElectionTweets
     output_file = os.path.expanduser(
         '~/ElectionTweetsAnalysis/data/%s/') % 'results/filter' + 'tweets'
     input_folder = '/mnt/chevron/bde/Data/TweetData/SampleTweets/2012/'
     runMRJob(
         mr_class,
         output_file,
         FilterElectionTweetsMRJobRunner.get_dated_input_files(
             input_files_start_time, input_files_end_time, input_folder),
         mrJobClassParams={'job_id': 'as'},
         #args = [],
          jobconf={'mapred.reduce.tasks': 300})
Example #35
 def noun_extractor(input_files):
   mr_class = POSTagger
   output_file = noun_cloud_output
   runMRJob(mr_class,
            output_file,
            # uncomment when running on local
            #fs.get_local_input_files(local_tweets_input_folder),
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 86400000}
   )
Example #36
 def get_users(input_files_start_time, input_files_end_time, input_folder):
   mr_class = Users
   output_file = f_users
   input_files = fs.get_dated_input_files(input_files_start_time,
                                     input_files_end_time,
                                     input_folder)
   runMRJob(mr_class,
            output_file,
            input_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 8640000}
   )
Example #37
def mr_analysis(startTime, endTime, outputFolder, inputFilesStartTime=None, inputFilesEndTime=None):
    if not inputFilesStartTime:
        inputFilesStartTime = startTime
        inputFilesEndTime = endTime
    outputFile = f_hashtag_objects % (outputFolder, startTime.strftime("%Y-%m-%d"), endTime.strftime("%Y-%m-%d"))
    #    outputFile = location_objects_file%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    #    outputFile = f_ltuo_location_and_ltuo_hashtag_and_occurrence_time%(outputFolder, startTime.strftime('%Y-%m-%d'), endTime.strftime('%Y-%m-%d'))
    runMRJob(
        MRAnalysis,
        outputFile,
        getInputFiles(inputFilesStartTime, inputFilesEndTime),
        jobconf={"mapred.reduce.tasks": 300},
    )
    FileIO.writeToFileAsJson(PARAMS_DICT, outputFile)
Example #38
 def count_at_mentions(input_files_start_time, input_files_end_time):
   mr_class = CountAtMentionTweets
   output_file = f_count_at_mentions
   runMRJob(mr_class,
            output_file,
            # uncomment when running on local
            #fs.get_local_input_files(local_tweets_input_folder),
            fs.get_dated_input_files(input_files_start_time,
                                     input_files_end_time,
                                     input_folder),
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 86400000}
   )
Example #39
 def get_pins(input_files_start_time, input_files_end_time, input_folder):
   mr_class = Pins
   output_file = f_pins
   chevron_files = fs.get_dated_input_files(input_files_start_time,
                                     input_files_end_time,
                                     input_folder)
   
   '''
   hdfs_files = []
   for file in chevron_files:
     hdfs_files = hdfs_rel_path + file
   ''' 
   runMRJob(mr_class,
            output_file,
            chevron_files,
            mrJobClassParams = {'job_id': 'as'},
            # uncomment when running on local
            #args = [],
            jobconf={'mapred.reduce.tasks':300, 'mapred.task.timeout': 8640000}
   )
Example #40
 def get_pins(input_files_start_time, input_files_end_time, input_folder):
     mr_class = Pins
     output_file = f_pins
     chevron_files = fs.get_dated_input_files(input_files_start_time,
                                              input_files_end_time,
                                              input_folder)
     '''
 hdfs_files = []
 for file in chevron_files:
   hdfs_files = hdfs_rel_path + file
 '''
     runMRJob(
         mr_class,
         output_file,
         chevron_files,
         mrJobClassParams={'job_id': 'as'},
         # uncomment when running on local
         #args = [],
         jobconf={
             'mapred.reduce.tasks': 300,
             'mapred.task.timeout': 8640000
         })
Example #41
File: analysis.py Project: WeiNiu/lsfolk
 def RunJob(mr_class, outputfile, input_file_start_time,
            input_file_end_time):
     runMRJob(mr_class,
              outputfile,
              getInputFiles(input_file_start_time, input_file_end_time),
              jobconf={'mapred.reduce.tasks': 100})
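
Taken together, every example follows the same recipe: pick an MRJob subclass, derive an output path, resolve the input files, and hand all three to runMRJob with a jobconf. A hypothetical driver for Example #41's RunJob, reusing TagCooccur and f_tag_cooccur from Example #8 purely for illustration:

import datetime

input_file_start_time = datetime.datetime(2012, 2, 1)
input_file_end_time = datetime.datetime(2012, 4, 30)
# RunJob resolves the dated input files itself via getInputFiles.
RunJob(TagCooccur, f_tag_cooccur, input_file_start_time, input_file_end_time)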