def mapper(self, key, hashtag_object):
    """Emit (hashtag, spatial_metrics) for every hashtag that has occurrences.

    Metrics cover the hashtag's whole lifetime: total occurrence count, the
    interval id with the most occurrences, and the focus / entropy / spread
    (radius of gyration) of its occurrence locations.
    """
    occurrences = hashtag_object['ltuo_occ_time_and_occ_location']
    if not occurrences:
        return
    hashtag = hashtag_object['hashtag']
    # Snap every occurrence time onto a TIME_UNIT_IN_SECONDS-wide interval boundary.
    bucketed_occurrences = [
        (GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS), occ_location)
        for occ_time, occ_location in occurrences
    ]
    lat_long_points = [
        UTMConverter.getLatLongUTMIdInLatLongForm(occ_location)
        for _, occ_location in occurrences
    ]
    occurrences_by_interval = GeneralMethods.group_items_by(bucketed_occurrences, key=itemgetter(0))
    occurrences_by_interval.sort(key=itemgetter(0))
    first_interval_time = occurrences_by_interval[0][0]
    # (interval id, occurrences in that interval); interval ids count up from
    # the first interval in which the hashtag was observed.
    iid_and_occ_count = [
        ((intvl_time - first_interval_time) / TIME_UNIT_IN_SECONDS, len(intvl_items))
        for intvl_time, intvl_items in occurrences_by_interval
    ]
    occurrences_by_location = GeneralMethods.group_items_by(bucketed_occurrences, key=itemgetter(1))
    location_to_occ_count = dict(
        (location, len(location_items))
        for location, location_items in occurrences_by_location
    )
    yield hashtag, {
        'hashtag': hashtag,
        # key spelling kept as-is: downstream readers look it up by this name
        'num_of_occurrenes': len(occurrences),
        'peak_iid': max(iid_and_occ_count, key=itemgetter(1))[0],
        'focus': focus(location_to_occ_count),
        'entropy': entropy(location_to_occ_count, as_bits=False),
        'spread': getRadiusOfGyration(lat_long_points),
    }
def reducer(self, location, it_performance_values):
    """Collapse all performance records for one location into a summary of the
    form {metric: [[prediction_method, metric_value], ...]} and emit it once.
    """
    all_records = list(chain.from_iterable(it_performance_values))
    summary_by_metric = defaultdict(list)
    records_by_method = GeneralMethods.group_items_by(all_records, key=itemgetter('prediction_method'))
    for method, method_records in records_by_method:
        for metric, metric_records in \
                GeneralMethods.group_items_by(method_records, key=itemgetter('metric')):
            # Only the first record's value is kept per (method, metric) pair.
            summary_by_metric[metric].append([method, metric_records[0]['metric_value']])
    yield '', dict(location=location, performance_summary=summary_by_metric)
def mapper(self, key, hashtag_object):
    """For every interval of a hashtag's lifetime, emit

        iid -> [is_peak, fraction_of_occurrences, cumulative_fraction]

    where iid is the interval's index from the hashtag's first interval.

    Fix: `occurrence_count/total_occurrences` was int/int, which floor-divides
    under Python 2 and truncated every per-interval fraction to 0.
    total_occurrences is now kept as a float (the `+0.0` convention already
    used elsewhere in this module).
    """
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    if not ltuo_occ_time_and_occ_location:
        return
    # Snap occurrence times onto TIME_UNIT_IN_SECONDS-wide interval boundaries.
    ltuo_intvl_time_and_occ_location = [
        (GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS), occ_location)
        for occ_time, occ_location in ltuo_occ_time_and_occ_location
    ]
    ltuo_intvl_time_and_items = GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
    ltuo_intvl_time_and_items.sort(key=itemgetter(0))
    first_time = ltuo_intvl_time_and_items[0][0]
    # (interval id, (interval start time, occurrence count in that interval))
    ltuo_iid_and_tuo_interval_and_occurrence_count = [
        ((intvl_time - first_time) / TIME_UNIT_IN_SECONDS, (intvl_time, len(items)))
        for intvl_time, items in ltuo_intvl_time_and_items
    ]
    peak_iid = max(
        ltuo_iid_and_tuo_interval_and_occurrence_count,
        key=lambda iid_and_data: iid_and_data[1][1]
    )[0]
    # +0.0 keeps the divisions below in float space under Python 2.
    total_occurrences = sum(
        occurrence_count
        for _, (__, occurrence_count) in ltuo_iid_and_tuo_interval_and_occurrence_count
    ) + 0.0
    current_val = 0.0
    for iid, (_, occurrence_count) in ltuo_iid_and_tuo_interval_and_occurrence_count:
        is_peak = 1.0 if iid == peak_iid else 0.0
        current_val += occurrence_count
        yield iid, [is_peak, occurrence_count/total_occurrences, current_val/total_occurrences]
def mapper(self, key, hashtag_object):
    # Accumulates per-location tallies on self and emits nothing directly
    # (presumably a mapper_final elsewhere flushes the accumulators -- verify).
    # `if False: yield` forces Python to compile this method as a generator,
    # as the MapReduce framework expects, without it ever yielding a value.
    if False:
        yield
    hashtag = hashtag_object['hashtag']
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    # Group the (occ_time, occ_location) pairs by location (element index 1).
    ltuo_location_and_items = GeneralMethods.group_items_by(ltuo_occ_time_and_occ_location, key=itemgetter(1))
    for location, items in ltuo_location_and_items:
        # Per location: the set of distinct hashtags seen, and the running
        # total of occurrences across all hashtags.
        self.mf_location_to_unique_hashtags[location].add(hashtag)
        self.mf_location_to_occurrences_count[location]+=len(items)
def reducer(self, location, it_ltuo_occ_time_and_count):
    """Merge the partial (occ_time, count) lists produced for one location,
    summing the counts that share the same occurrence time.
    """
    all_pairs = list(chain.from_iterable(it_ltuo_occ_time_and_count))
    merged = []
    for occ_time, pairs_at_time in GeneralMethods.group_items_by(all_pairs, key=itemgetter(0)):
        # Each element of pairs_at_time is an (occ_time, count) tuple.
        merged.append((occ_time, sum(count for _, count in pairs_at_time)))
    yield location, {'location': location, 'ltuo_occ_time_and_count': merged}
def get_components_by_clustering(self, graph):
    """Cluster `graph` with affinity propagation and return a tuple of
    components, one tuple of nodes per cluster id.
    """
    _, node_and_cluster_id_pairs = clusterUsingAffinityPropagation(graph)
    grouped_by_cluster = GeneralMethods.group_items_by(node_and_cluster_id_pairs, itemgetter(1))
    # (cluster_id, nodes) per cluster; each member is a (node, cluster_id) pair.
    cluster_id_and_nodes = [
        (cluster_id, tuple(node for node, _ in members))
        for cluster_id, members in grouped_by_cluster
    ]
    return zip(*cluster_id_and_nodes)[1]
def mapper(self, key, hashtag_object):
    # Drops a hashtag's occurrences from locations where it occurred fewer
    # than MIN_HASHTAG_OCCURRENCES_PER_LOCATION times, then re-emits the
    # (hashtag, hashtag_object) pair with the filtered occurrence list.
    #
    # NOTE(review): when hashtag_object is a raw JSON string, `'hashtag' in
    # hashtag_object` is a substring test and the decode below is correct;
    # but if a dict with a 'hashtag' key ever arrives here, the test is True
    # and cjson.decode would fail on it. Presumably upstream only passes
    # strings into this branch -- verify against the job wiring.
    if 'hashtag' in hashtag_object:
        hashtag_object = cjson.decode(hashtag_object)
    ltuo_occ_time_and_occ_location = hashtag_object.get('ltuo_occ_time_and_occ_location', [])
    # Group (occ_time, occ_location) pairs by location (element index 1).
    ltuo_location_and_items = GeneralMethods.group_items_by(ltuo_occ_time_and_occ_location, key=itemgetter(1))
    ltuo_location_and_items = filter(
        lambda (location, items): len(items)>=MIN_HASHTAG_OCCURRENCES_PER_LOCATION,
        ltuo_location_and_items
    )
    # Flatten the surviving per-location occurrence lists back into one list.
    hashtag_object['ltuo_occ_time_and_occ_location'] =\
        list(chain(*map(lambda (_, items): items, ltuo_location_and_items)))
    yield hashtag_object['hashtag'], hashtag_object
def mapper(self, hashtag, hashtag_object):
    """For each interval of a hashtag's lifetime, emit metrics keyed by the
    interval's offset from the peak interval (iid - peak_iid):

        [fraction_of_occurrences, entropy, focus, coverage,
         overall_entropy - entropy, overall_focus - focus,
         overall_coverage - coverage]

    Fix: `len(lids)/total_occurrences` was int/int, which floor-divides under
    Python 2 and truncated the per-interval fraction to 0. total_occurrences
    is now a float (the `+0.0` convention already used in this module).
    """
    def distance_from_overall_locality_stat(overall_stat, current_stat):
        # Signed gap between the hashtag-wide statistic and this interval's.
        return overall_stat - current_stat

    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    if not ltuo_occ_time_and_occ_location:
        return
    # Snap each occurrence time onto a TIME_UNIT_IN_SECONDS interval boundary.
    ltuo_intvl_time_and_occ_location = [
        (GeneralMethods.approximateEpoch(occ_time, TIME_UNIT_IN_SECONDS), occ_location)
        for occ_time, occ_location in ltuo_occ_time_and_occ_location
    ]
    ltuo_intvl_time_and_items = GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(0))
    ltuo_intvl_time_and_items.sort(key=itemgetter(0))
    first_time = ltuo_intvl_time_and_items[0][0]
    # (interval id, (interval time, location ids of the interval's occurrences))
    ltuo_iid_and_tuo_interval_and_lids = [
        ((intvl_time - first_time) / TIME_UNIT_IN_SECONDS,
         (intvl_time, [occ_location for _, occ_location in items]))
        for intvl_time, items in ltuo_intvl_time_and_items
    ]
    peak_iid = max(
        ltuo_iid_and_tuo_interval_and_lids,
        key=lambda iid_and_data: len(iid_and_data[1][1])
    )[0]
    # Hashtag-wide ("overall") locality statistics, the baseline for the
    # per-interval distances emitted below.
    ltuo_location_and_items = GeneralMethods.group_items_by(ltuo_intvl_time_and_occ_location, key=itemgetter(1))
    overall_mf_lid_to_occurrence_count = dict(
        (lid, len(items)) for lid, items in ltuo_location_and_items
    )
    overall_points = [
        UTMConverter.getLatLongUTMIdInLatLongForm(loc)
        for _, loc in ltuo_occ_time_and_occ_location
    ]
    overall_entropy = entropy(overall_mf_lid_to_occurrence_count, False)
    overall_focus = focus(overall_mf_lid_to_occurrence_count)[1]
    overall_coverage = getRadiusOfGyration(overall_points)
    # +0.0 keeps the per-interval fraction below in float space (Python 2).
    total_occurrences = sum(
        len(lids) for _, (__, lids) in ltuo_iid_and_tuo_interval_and_lids
    ) + 0.0
    for iid, (_, lids) in ltuo_iid_and_tuo_interval_and_lids:
        mf_lid_to_occurrence_count = defaultdict(float)
        for lid in lids:
            mf_lid_to_occurrence_count[lid] += 1
        points = [UTMConverter.getLatLongUTMIdInLatLongForm(lid) for lid in lids]
        current_entropy = entropy(mf_lid_to_occurrence_count, False)
        current_focus = focus(mf_lid_to_occurrence_count)[1]
        current_coverage = getRadiusOfGyration(points)
        yield iid - peak_iid, [
            len(lids) / total_occurrences,
            current_entropy,
            current_focus,
            current_coverage,
            distance_from_overall_locality_stat(overall_entropy, current_entropy),
            distance_from_overall_locality_stat(overall_focus, current_focus),
            distance_from_overall_locality_stat(overall_coverage, current_coverage),
        ]
def entropy_examples(): output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png' data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)] ltuo_hashtag_and_num_of_occurrences_and_entropy =\ map( itemgetter('hashtag', 'num_of_occurrenes', 'entropy'), data ) ltuo_hashtag_and_num_of_occurrences_and_entropy =\ map( lambda (h, n, e): (h, n, round(e,0)), ltuo_hashtag_and_num_of_occurrences_and_entropy ) for entropy, entropy_data in \ GeneralMethods.group_items_by(ltuo_hashtag_and_num_of_occurrences_and_entropy, itemgetter(2)): entropy_data.sort(key=itemgetter(1)) hashtags = map(itemgetter(0), entropy_data) print entropy, len(entropy_data), hashtags[:25]
def mapper1(self, key, hashtag_object):
    # Accumulates, on self, each location's (hashtag, first-occurrence-time)
    # pairs and the set of other locations this hashtag also appeared in.
    # `if False: yield` forces Python to compile this method as a generator,
    # as the MapReduce framework expects, without it ever yielding a value.
    if False:
        yield
    hashtag = hashtag_object['hashtag']
    ltuo_occ_time_and_occ_location = hashtag_object['ltuo_occ_time_and_occ_location']
    # Group occurrences by location, then keep only each location's earliest
    # occurrence time for this hashtag.
    ltuo_location_and_items = GeneralMethods.group_items_by(ltuo_occ_time_and_occ_location, key=itemgetter(1))
    ltuo_location_and_occurrence_time =\
        [(location, min(items, key=itemgetter(0))[0])for location, items in ltuo_location_and_items]
    # Snap those first-occurrence times onto TIME_UNIT_IN_SECONDS boundaries.
    ltuo_location_and_occurrence_time = [(
        location,
        GeneralMethods.approximateEpoch(occurrence_time, TIME_UNIT_IN_SECONDS)
    ) for location, occurrence_time in ltuo_location_and_occurrence_time]
    if ltuo_location_and_occurrence_time:
        # Discard locations whose first-occurrence time is an outlier.
        occurrence_times = filter_outliers(zip(*ltuo_location_and_occurrence_time)[1])
        ltuo_location_and_occurrence_time =\
            filter(lambda (l, o): o in occurrence_times, ltuo_location_and_occurrence_time)
        for location, occurrence_time in ltuo_location_and_occurrence_time:
            self.mf_location_to_ltuo_hashtag_and_min_occ_time[location].append([hashtag, occurrence_time])
            # Every other surviving location of this hashtag is a neighbor.
            for neighbor_location, _ in ltuo_location_and_occurrence_time:
                if location!=neighbor_location:
                    self.mf_location_to_neighbor_locations[location].add(neighbor_location)
def example_for_caverlee():
    # Plots, for four named NYC locations, the normalized distribution of
    # tweet occurrences by hour of day (2x2 subplot grid), and saves the
    # combined figure as 'ny_locations.png'.
#    valid_locations = ['18T_585E_4512N', '18T_587E_4514N']
    # UTM grid id -> human-readable landmark name; only these ids are plotted.
    mf_lid_to_location = dict([
        ('18T_585E_4512N', 'Times Square'),
        ('18T_587E_4514N', 'Central Park'),
        ('18T_584E_4511N', 'Penn Station'),
        ('18T_585E_4511N', 'Empire State Building'),
    ])
    output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
    subplot_num = 221  # 2x2 grid, advancing one panel per plotted location
#    plt.figure(num=None, figsize=(6,3))
    for data in FileIO.iterateJsonFromFile(f_example_for_caverlee, remove_params_dict=True):
        location = data['location']
        if location in mf_lid_to_location:
            # Shift timestamps by -5 hours -- presumably UTC to US Eastern
            # (no DST handling); verify against how occ times were recorded.
            td = timedelta(hours=-5)
            ltuo_occ_time_and_count = data['ltuo_occ_time_and_count']
            ltuo_occ_time_and_count.sort(key=itemgetter(0))
            occ_times, counts = zip(*ltuo_occ_time_and_count)
            occ_times = map(datetime.fromtimestamp, occ_times)
            occ_times = map(lambda d: d+td, occ_times)
            occ_hours = map(lambda d: d.hour, occ_times)
            # Re-aggregate the counts by hour of day (0-23).
            ltuo_occ_hour_and_count = zip(occ_hours, counts)
            ltuo_occ_hour_and_count = [(h, sum(zip(*h_c)[1]))
                for h, h_c in GeneralMethods.group_items_by(ltuo_occ_hour_and_count, key=itemgetter(0))]
            occ_hours, counts = zip(*ltuo_occ_hour_and_count)
            # Normalize to fractions; +0.0 forces float division (Python 2).
            total_counts = sum(counts)+0.0
            counts = map(lambda c: c/total_counts, counts)
            plt.subplot(subplot_num)
#            plt.subplots_adjust(bottom=0.2, top=0.9)
            subplot_num+=1
            plt.plot(occ_hours, counts, color='#EA00FF', lw=1)
            plt.fill_between(occ_hours, counts, color='#EA00FF', alpha=0.25)
#            plt.ylabel('% of tweets')
            plt.xlabel('Time of day')
            plt.xlim(xmax=23)
            plt.ylim(ymax=0.09)
            plot_anchored_text(mf_lid_to_location[location], loc=2)
            plt.grid(True)
#            savefig(output_file_format%mf_lid_to_location[location].replace(' ', '_'))
            # Re-saves the combined figure after each panel; the final write
            # contains all plotted locations.
            savefig(output_file_format%'ny_locations')
def mapper_final(self):
    """Flush the per-location occurrence-time accumulator, emitting
    (location, [(occ_time, count), ...]) pairs.
    """
    identity = lambda item: item
    for location, occ_times in self.mf_location_to_occ_times.iteritems():
        # Group identical timestamps together and count how many each has.
        grouped_times = GeneralMethods.group_items_by(occ_times, identity)
        counts_per_time = [(occ_time, len(members)) for occ_time, members in grouped_times]
        yield location, counts_per_time