Example No. 1
 def ef_plot():
     output_file = fld_data_analysis_results%GeneralMethods.get_method_id()+'.png'
     data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
     ltuo_hashtag_and_entropy_and_focus = map(itemgetter('hashtag', 'entropy', 'focus'), data)
     mf_norm_focus_to_entropies = defaultdict(list)
     for _, entropy, (_, focus) in ltuo_hashtag_and_entropy_and_focus:
         mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
     plt.figure(num=None, figsize=(6,3))
     x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                                 for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                                 if len(entropies)>0])
     plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
     plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
     plt.xlim(xmin=-0.1, xmax=1.1)
     plt.ylim(ymin=-1, ymax=9)
     plt.xlabel('Mean hashtag focus')
     plt.ylabel('Mean hashtag entropy')
     plt.grid(True)
     savefig(output_file)
     ltuo_hashtag_and_r_entropy_and_focus = sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
     ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
     hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
     print list(hashtags[:20])
     print list(reversed(hashtags))[:20]
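A minimal, self-contained sketch of the bucket-and-average pattern used above, for reference (Python 3; FileIO, fld_data_analysis_results, and the (location, score) focus pairs of the original codebase are replaced by a scalar focus and synthetic data):

from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

def ef_plot_sketch(ltuo_hashtag_and_entropy_and_focus, output_file='ef_plot.png'):
    # Bucket entropies by focus rounded to two decimals, then plot the
    # per-bucket mean entropy against focus.
    mf_norm_focus_to_entropies = defaultdict(list)
    for _, entropy, focus in ltuo_hashtag_and_entropy_and_focus:
        mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
    x_focus, y_entropy = zip(*sorted((f, np.mean(es))
                                     for f, es in mf_norm_focus_to_entropies.items()))
    plt.figure(figsize=(6, 3))
    plt.scatter(x_focus, y_entropy, s=50, lw=0, c='k')
    plt.xlim(-0.1, 1.1)
    plt.xlabel('Mean hashtag focus')
    plt.ylabel('Mean hashtag entropy')
    plt.grid(True)
    plt.savefig(output_file)

# Usage with synthetic data:
# ef_plot_sketch([('tag', np.random.rand()*8, np.random.rand()) for _ in range(500)])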
Example No. 2
 def plot_correlation_ef_plot(condition, id, hashtags, focuses, entropies, peaks):
     TIME_UNIT_IN_SECONDS = 10.*60.
     mf_norm_focus_to_entropies = defaultdict(list)
     mf_norm_focus_to_peaks = defaultdict(list)
     for focus, entropy, peak in zip(focuses,entropies, peaks):
         if condition(peak):
             mf_norm_focus_to_entropies[round(focus, 2)].append(entropy)
             mf_norm_focus_to_peaks[round(focus, 2)].append(peak)
     x_focus, y_entropy = zip(*[(norm_focus, np.mean(entropies))
                                 for norm_focus, entropies in mf_norm_focus_to_entropies.iteritems()
                                 if len(entropies) > 5])
     _, z_peak = zip(*[(norm_focus, np.mean(peaks)*TIME_UNIT_IN_SECONDS/60)
                        for norm_focus, peaks in mf_norm_focus_to_peaks.iteritems()
                        if len(peaks) > 5])
     plt.figure(num=None, figsize=(6,3))
     plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
     cm = matplotlib.cm.get_cmap('cool')
     sc = plt.scatter(x_focus, y_entropy, c=z_peak, cmap=cm, s=50, lw=0,)
     plt.colorbar(sc)
     plt.xlim(xmin=-0.1, xmax=1.1)
     plt.ylim(ymin=-1, ymax=9)
     plt.xlabel('Mean hashtag focus')
     plt.ylabel('Mean hashtag entropy')
     plt.grid(True)
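     # output_file_format is presumably a module-level format string in the original source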
     savefig(output_file_format%id)
     ltuo_hashtag_and_entropy_and_focus = zip(hashtags, entropies, focuses)
     ltuo_hashtag_and_r_entropy_and_focus = sorted(ltuo_hashtag_and_entropy_and_focus, key=itemgetter(1), reverse=True)
     ltuo_hashtag_and_r_entropy_and_s_focus = sorted(ltuo_hashtag_and_r_entropy_and_focus, key=itemgetter(2))
     hashtags = zip(*ltuo_hashtag_and_r_entropy_and_s_focus)[0]
     print id, list(hashtags)
     print id, list(reversed(hashtags))
Example No. 3
    def temporal_affinity_vs_distance():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        DataAnalysis._plot_affinities('adoption_lag')
        plt.xlabel('Distance (miles)')
        plt.ylabel('Hashtag adoption lag (hours)')
#        plt.show()
        savefig(output_file)
Example No. 4
 def significant_nei_utm_ids():
     output_folder = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'/%s.png'
     for i, data in enumerate(FileIO.iterateJsonFromFile(f_significant_nei_utm_ids, remove_params_dict=True)):
         utm_lat_long = UTMConverter.getLatLongUTMIdInLatLongForm(data['utm_id'])
         nei_utm_lat_longs = map(
                           lambda nei_utm_id: UTMConverter.getLatLongUTMIdInLatLongForm(nei_utm_id),
                           data['nei_utm_ids']
                         )
         if nei_utm_lat_longs:
             output_file = output_folder%('%s_%s'%(utm_lat_long))
             plotPointsOnWorldMap(nei_utm_lat_longs,
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#EA00FF',
                                  alpha=1.)
             _, m = plotPointsOnWorldMap([utm_lat_long],
                                  blueMarble=False,
                                  bkcolor='#CFCFCF',
                                  lw = 0,
                                  color = '#2BFF00',
                                  s = 40,
                                  returnBaseMapObject=True,
                                  alpha=1.)
             for nei_utm_lat_long in nei_utm_lat_longs:
                 m.drawgreatcircle(utm_lat_long[1],
                                   utm_lat_long[0],
                                   nei_utm_lat_long[1],
                                   nei_utm_lat_long[0],
                                   color='#FFA600',
                                   lw=1.5,
                                   alpha=1.0)
             print 'Saving %s'%(i+1)
             savefig(output_file)
Example No. 5
    def content_affinity_vs_distance():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        DataAnalysis._plot_affinities('similarity')
        plt.xlabel('Distance (miles)')
        plt.ylabel('Hashtags sharing similarity')
#        plt.show()
        savefig(output_file)
Example No. 6
    def iid_vs_cumulative_distribution_and_peak_distribution():
        TIME_UNIT_IN_SECONDS = 10.*60.
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        ltuo_iid_and_interval_stats = [data for data in 
                                        FileIO.iterateJsonFromFile(f_iid_spatial_metrics, remove_params_dict=True)]
        ltuo_s_iid_and_interval_stats = sorted(ltuo_iid_and_interval_stats, key=itemgetter(0))
        ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences = [(data[0], (data[1][0], data[1][2])) for data in ltuo_s_iid_and_interval_stats]
        total_peaks = sum([data[1][0] for data in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences])+0.0
        x_iids = []
        y_is_peaks = []
        z_cumulative_percentage_of_occurrencess = []
        for (iid, (is_peak, cumulative_percentage_of_occurrences)) in ltuo_s_iid_and_tuo_is_peak_and_cumulative_percentage_of_occurrences[:100]: 
            print (iid, (is_peak, cumulative_percentage_of_occurrences)) 
            x_iids.append((iid+1)*TIME_UNIT_IN_SECONDS/60)
            y_is_peaks.append(is_peak/total_peaks)
            z_cumulative_percentage_of_occurrencess.append(cumulative_percentage_of_occurrences)
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, y_is_peaks, marker='o', c='k')
        plt.ylabel('Distribution of hashtags')
        plt.xlabel('Hashtag peak (minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'peaks')
        plt.clf()
        plt.figure(num=None, figsize=(6,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, wspace=0, hspace=0)
        plt.plot(x_iids, z_cumulative_percentage_of_occurrencess, lw=0, marker='o', c='k')
#        plt.xlabel('Minutes')
        plt.ylabel('CDF of occurrences')
        plt.xlabel('Time (Minutes)')
        plt.grid(True)
        plt.xlim(xmax=600)
        savefig(output_file_format%'cdf_occurrences_peak')
Example No. 7
def performanceWithSpamDetection(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0,0.4,0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
#    spammerPercentages = [0.2, 0.01, 0.01]
    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        for spamDetectionRatio, spammerPercentage in zip(ratios, spammerPercentages):
            experimentFileName = spamModelFolder+'performanceWithSpamDetection/%s/%0.3f'%(iteration,spamDetectionRatio)
            print experimentFileName
            if generateData:
                model = MixedUsersModel()
                conf = {'model': model,
                        'numberOfTimeSteps': 100,
                        'addUsersMethod': User.addUsersUsingRatioWithSpamDetection,
                        'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                        'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
#                        'spammerMessagingProbability': spammerBudget,
                        'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered,
                                           RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                        'spamDetectionRatio': spamDetectionRatio,
                        'experimentFileName': experimentFileName}
                GeneralMethods.runCommand('rm -rf %s'%experimentFileName)
                run(**conf)
            else:
                for data in FileIO.iterateJsonFromFile(experimentFileName):
                    for ranking_id in data['spammmess']:
                        if data['currentTimeStep'] not in experimentData[spamDetectionRatio]: experimentData[spamDetectionRatio][data['currentTimeStep']]=defaultdict(list)
                        experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id]+=data['spammmess'][ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
            for timeUnit in experimentData[spamDetectionRatio]:
                dataToPlot['x'].append(timeUnit)
                for ranking_id in experimentData[spamDetectionRatio][timeUnit]: dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][timeUnit][ranking_id]))
            sdr[spamDetectionRatio]=dataToPlot
        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
#        for ranking_id in [RankingModel.LATEST_MESSAGES, RankingModel.POPULAR_MESSAGES]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
                dataY = smooth(sdr[spamDetectionRatio][ranking_id],8)[:len(sdr[spamDetectionRatio]['x'])]
                dataX, dataY = sdr[spamDetectionRatio]['x'][10:], dataY[10:]
                print 'x', [x-10 for x in dataX]
                if spamDetectionRatio==0.0: 
                    print ranking_id, dataY
                    plt.plot([x-10 for x in dataX], dataY, label='%s'%(labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
                else: 
                    print ranking_id, dataY
                    plt.plot([x-10 for x in dataX], dataY, label='%s (%d'%(labels[ranking_id].replace('Filtering', 'Detection'),spamDetectionRatio*100)+'%)', lw=1, marker=marker[spamDetectionRatio])
            plt.ylim(ymin=0, ymax=1)
            plt.xlim(xmin=0, xmax=75)
#            plt.title(ranking_id)
            plt.legend()
            plt.xlabel('Time', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#            plt.show()
#            plt.savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            savefig('performanceWithSpamDetection_%s.png'%ranking_id)
            plt.clf()
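The smooth() helper above comes from the surrounding codebase and is not shown; a plausible stand-in, assuming it is a simple moving average (the original implementation may differ):

import numpy as np

def smooth(values, window):
    # Hypothetical stand-in for the external smooth() helper: a centered
    # moving average over `window` samples, same length as the input.
    kernel = np.ones(window) / float(window)
    return np.convolve(values, kernel, mode='same')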
Example No. 8
    def top_k_locations_on_world_map():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        ltuo_location_and_occurrence_count = []
        for location_object in\
                FileIO.iterateJsonFromFile(f_dense_hashtag_distribution_in_locations, remove_params_dict=True):
            ltuo_location_and_occurrence_count.append([
                                                      location_object['location'],
                                                      location_object['occurrences_count']
                                                    ])
        ltuo_lid_and_r_occurrence_count = sorted(ltuo_location_and_occurrence_count, key=itemgetter(1), reverse=True)
#        for i, d in enumerate(ltuo_lid_and_r_occurrence_count):
#            print i, d
#        exit()
        lids = zip(*ltuo_lid_and_r_occurrence_count)[0][:200]
        points = map(UTMConverter.getLatLongUTMIdInLatLongForm, lids)
        plotPointsOnWorldMap(points, blueMarble=False, bkcolor='#CFCFCF', c='m',  lw = 0, alpha=1.)
        savefig(output_file)
Example No. 9
    def peak_stats():
        TIME_UNIT_IN_SECONDS = 10.*60.
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
        peaks = map(itemgetter('peak_iid'), data)
        peaks = filter(lambda i: i<288, peaks)
        ltuo_peak_and_count = [(peak, len(list(ito_peaks)))
                            for peak, ito_peaks in groupby(sorted(peaks))
                            ]
        ltuo_s_peak_and_count = sorted(ltuo_peak_and_count, key=itemgetter(0))        
        current_count = 0.0
        total_count = len(peaks)+0.
        print total_count
        ltuo_peak_and_cdf = []
        for peak, count in ltuo_s_peak_and_count:
            current_count+=count
            ltuo_peak_and_cdf.append([(peak+1)*TIME_UNIT_IN_SECONDS/(60.), current_count/total_count ])
        x_peaks, y_cdf = zip(*ltuo_peak_and_cdf)
        plt.figure(num=None, figsize=(4.3,3))
        ax=plt.subplot(111)
        ax.set_xscale('log')
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15)
        plt.scatter(x_peaks, y_cdf, c='k', s=50, lw=0)
        plt.xlabel('Time (minutes)')
        plt.ylabel('CDF')
        plt.xlim(xmin=5.)
        plt.grid(True)
#        plt.show()             
        savefig(output_file_format%'peak_cdf')
        plt.clf()
        
#        plt.figure(num=None, figsize=(4.3,3))
        ax=plt.subplot(111)
        ax.set_xscale('log')
        ax.set_yscale('log')
        x_peaks, y_counts = zip(*ltuo_s_peak_and_count)
        x_peaks = [(peak+1)*TIME_UNIT_IN_SECONDS/(60.) for peak in x_peaks]
        y_counts = [count/total_count for count in y_counts]
        plt.scatter(x_peaks, y_counts, c='k', s=50, lw=0)
        plt.xlabel('Time (minutes)')
        plt.ylabel('Distribution of hashtags')
        plt.xlim(xmin=5)
        plt.ylim(ymax=1., ymin=0.00005)
        plt.grid(True)
        savefig(output_file_format%'peak_dist')
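For reference, the CDF computation above reduces to this self-contained sketch (synthetic values in place of the peak intervals):

from itertools import groupby

def cdf_from_values(values):
    # Group identical values, then accumulate counts into a CDF,
    # mirroring the running-total loop in peak_stats() above.
    pairs = [(v, len(list(group))) for v, group in groupby(sorted(values))]
    total = float(len(values))
    xs, cdf, running = [], [], 0.0
    for v, count in pairs:
        running += count
        xs.append(v)
        cdf.append(running / total)
    return xs, cdf

# cdf_from_values([3, 1, 2, 2, 3, 3]) -> ([1, 2, 3], [0.166..., 0.5, 1.0])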
Example No. 10
    def plot_global_influencers(ltuo_model_id_and_hashtag_tag):
        tuples_of_boundary_and_boundary_label = [
                ([[-90,-180], [90, 180]], 'World', 'm'),
            ]
        for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
            print model_id, hashtag_tag
            tuples_of_location_and_color = []
            for boundary, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
                tuo_location_and_influence_scores = Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary)
                tuo_location_and_influence_scores = sorted(tuo_location_and_influence_scores, key=itemgetter(1))[:10]
                locations = zip(*tuo_location_and_influence_scores)[0]
                for location in locations: tuples_of_location_and_color.append([getLocationFromLid(location.replace('_', ' ')), boundary_color])
            locations, colors = zip(*tuples_of_location_and_color)
            plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors,  lw = 0, alpha=1.)
            for _, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label: plt.scatter([0], [0], label=boundary_label, c=boundary_color, lw = 0)
#            plt.legend(loc=3, ncol=4, mode="expand",)
#            plt.show()
            savefig(fld_results%(GeneralMethods.get_method_id()) +'%s_%s.png'%(model_id, hashtag_tag))
Example No. 11
    def location_influence_plots(model_ids, no_of_bins_for_influence_score=100):
        for model_id in model_ids:
            output_file_format = fld_results%(GeneralMethods.get_method_id()) + '%s_%s.png'
            tuo_input_location_and_label_and_marking_locations = [ 
#                                [ '40.6000_-73.9500', 'new_york', ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '29.7250_-97.1500','30.4500_-95.7000']],
                                ['29.7250_-97.1500', 'austin',  ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '39.1500_-83.3750','30.4500_-95.7000', '40.6000_-73.9500']], 
#                                ['30.4500_-95.7000', 'college_station', ['-23.2000_-46.4000', '-22.4750_-42.7750', '51.4750_0.0000', '33.3500_-118.1750', '29.7250_-97.1500','30.4500_-95.7000', '40.6000_-73.9500']],
                            ] 
            tuo_location_and_tuo_neighbor_location_and_influence_score = \
                Experiments.load_tuo_location_and_tuo_neighbor_location_and_pure_influence_score(model_id)
            for input_location, label, marking_locations in tuo_input_location_and_label_and_marking_locations:
                for location, tuo_neighbor_location_and_influence_score in \
                        tuo_location_and_tuo_neighbor_location_and_influence_score:
                    if input_location==location:
                        InfluenceAnalysis._plot_scores(tuo_neighbor_location_and_influence_score, marking_locations, no_of_bins_for_influence_score)
                        plt.xlim(-1,1); plt.ylim(ymin=0.0)
#                        plt.show()  # showing before savefig can leave a blank saved figure in non-interactive runs
                        savefig(output_file_format%(label, model_id))
                        break
Example No. 12
    def plot_location_plots_with_zones(ltuo_model_id_and_hashtag_tag, no_of_bins_for_influence_score=100):
        output_file_format = fld_results+'/%s_%s.png'
        for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
            no_of_zones, ltuo_location_and_influence_score_and_zone_id = \
                Experiments.get_location_with_zone_ids(model_id, hashtag_tag)
            locations, influence_scores, zone_ids = zip(*ltuo_location_and_influence_score_and_zone_id)
#            print len(locations)
#            print [zone_id for _, _, zone_id in sorted(zip(locations, influence_scores, zone_ids), key=itemgetter(1))]
#            exit()
            # Plot influence plot
            ltuo_location_and_global_influence_score = zip(locations, influence_scores)
            max_y_tick = InfluenceAnalysis._plot_scores(ltuo_location_and_global_influence_score, [], no_of_bins_for_influence_score, smooth=True)
            # Plot zones
            ltuo_influence_score_and_zone_id = zip(influence_scores, zone_ids)
            ltuo_zone_id_and_influence_scores = [(zone_id, zip(*ito_tuo_influence_score_and_zone_id)[0])
                                                    for zone_id, ito_tuo_influence_score_and_zone_id in
                                                        groupby(
                                                                sorted(ltuo_influence_score_and_zone_id, key=itemgetter(1)),
                                                                key=itemgetter(1)
                                                        )
                                                ]
            ltuo_zone_id_and_tuo_min_influence_score_and_max_influence_score = \
                [(zone_id, (min(influence_scores), max(influence_scores))) for zone_id, influence_scores in ltuo_zone_id_and_influence_scores]
            ltuo_zone_id_and_tuo_box_start_and_box_width = \
                [(zone_id, (min_influence_score, abs(min_influence_score-max_influence_score))) 
                     for zone_id, (min_influence_score, max_influence_score) in 
                        ltuo_zone_id_and_tuo_min_influence_score_and_max_influence_score
                ]
            zone_ids, ltuo_box_start_and_box_width = zip(*ltuo_zone_id_and_tuo_box_start_and_box_width)
            zone_colors = [GeneralMethods.getRandomColor() for zone_id in zone_ids]
            plt.broken_barh(ltuo_box_start_and_box_width , (0, max_y_tick), facecolors=zone_colors, alpha=0.25, lw=0)
#            temp_ltuo_box_start_and_box_width = []
#            for box_start, box_width in ltuo_box_start_and_box_width:
#                if box_width!=0: temp_ltuo_box_start_and_box_width.append((box_start, box_width))
#                else: temp_ltuo_box_start_and_box_width.append((box_start, 0.0001))

#            zero_size_cluster_ltuo_box_start_and_box_width = []
#            for box_start, box_width in ltuo_box_start_and_box_width:
#                if box_width==0: zero_size_cluster_ltuo_box_start_and_box_width.append((box_start, 0.0001))
#            plt.broken_barh(zero_size_cluster_ltuo_box_start_and_box_width , (0, max_y_tick), facecolors='r', alpha=0.25, lw=0)
#            plt.xlim(xmin=-0.0025, xmax=0.0025)
            output_file = output_file_format%(GeneralMethods.get_method_id(), model_id, hashtag_tag)
            savefig(output_file)
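The zone shading above relies on plt.broken_barh, which takes a list of (start, width) pairs plus a single (y, height) range; a tiny standalone sketch with synthetic ranges:

import matplotlib.pyplot as plt

# Shade three x-ranges from y=0 up to y=5, as the zone plot above does.
plt.broken_barh([(0.0, 0.2), (0.35, 0.1), (0.6, 0.3)], (0, 5),
                facecolors=['r', 'g', 'b'], alpha=0.25, lw=0)
plt.savefig('zones_sketch.png')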
Example No. 13
    def hashtag_locations_distribution_loglog():
        ltuo_no_of_locations_and_count = []
        for data in FileIO.iterateJsonFromFile(f_hashtag_and_location_distribution, remove_params_dict=True):
            if data[0]=='location' : ltuo_no_of_locations_and_count.append(data[1:])
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        no_of_locations, counts = zip(*ltuo_no_of_locations_and_count)
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.17)
        ax = plt.subplot(111)
        ax.set_xscale('log')
        ax.set_yscale('log')
        plt.scatter(no_of_locations, counts, c='k')
        plt.xlabel('No. of locations')
        plt.ylabel('No. of hashtags')
        plt.xlim(xmin=0.1)   # NB: 1/10 is integer division (0) under Python 2, which breaks the log axis
        plt.ylim(ymin=0.1)
        plt.grid(True)
#        plt.show()
        savefig(output_file)
Example No. 14
    def global_influence_plots(ltuo_model_id_and_hashtag_tag, no_of_bins_for_influence_score=100):
        marking_locations = [
                             '18.8500_-98.6000',
#                             '2.9000_101.5000',
                             '51.4750_0.0000',
                             '33.3500_-118.1750',
#                             '-23.2000_-46.4000',
                             '-22.4750_-42.7750',
                             '39.1500_-83.3750',
                             '40.6000_-73.9500',
                             '29.7250_-97.1500',
                             '30.4500_-95.7000'
                             ]
        for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
            output_file = fld_results%(GeneralMethods.get_method_id()) + '%s_%s.png'%(model_id, hashtag_tag)
            tuo_location_and_global_influence_score = Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag)
            InfluenceAnalysis._plot_scores(tuo_location_and_global_influence_score, marking_locations, no_of_bins_for_influence_score, smooth=True)
            plt.ylim(ymin=0.0)
#            plt.show()
            savefig(output_file)
Example No. 15
 def utm_ids_on_map():
     ''' Plots UTM ids on a world map. Point color encodes
     log(total_hashtag_count).
     '''
     output_file = fld_google_drive_data_analysis%GeneralMethods.get_method_id()+'.png'
     ltuo_point_and_total_hashtag_count = []
     for utm_object in FileIO.iterateJsonFromFile(f_hashtags_by_utm_id, remove_params_dict=True):
         point = UTMConverter.getLatLongUTMIdInLatLongForm(utm_object['utm_id'])
         total_hashtag_count = log(utm_object['total_hashtag_count'])
         ltuo_point_and_total_hashtag_count.append((point, total_hashtag_count))
     points, total_hashtag_counts = zip(*sorted(ltuo_point_and_total_hashtag_count, key=itemgetter(1)))
     plotPointsOnWorldMap(points,
                          blueMarble=False,
                          bkcolor='#CFCFCF',
                          c=total_hashtag_counts,
                          cmap=matplotlib.cm.cool,
                          lw = 0,
                          alpha=1.)
     
     savefig(output_file)
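A standalone sketch of the log-count coloring idea (plotPointsOnWorldMap and UTMConverter belong to the original codebase; a plain lat/long scatter stands in here):

from math import log
import matplotlib.pyplot as plt

# Color points by log(count) so heavy-tailed counts stay distinguishable.
points = [(40.6, -73.9), (51.4, 0.0), (29.7, -97.1)]   # synthetic (lat, lng)
counts = [100000, 50000, 1000]
lats, lngs = zip(*points)
plt.scatter(lngs, lats, c=[log(c) for c in counts], cmap='cool', lw=0)
plt.colorbar()
plt.savefig('utm_counts_sketch.png')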
Example No. 16
 def plot_graph(locality_measures, id):
     mf_apprx_to_count = defaultdict(float)
     for measure in locality_measures:
         mf_apprx_to_count[round(measure,3)]+=1
     total_hashtags = sum(mf_apprx_to_count.values())
     current_val = 0.0
     x_measure, y_distribution = [], []
     for apprx, count in sorted(mf_apprx_to_count.iteritems(), key=itemgetter(0)):
         current_val+=count
         x_measure.append(apprx)
         y_distribution.append(current_val/total_hashtags)
     plt.figure(num=None, figsize=(4.3,3))
     plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15, wspace=0)
     plt.scatter(x_measure, y_distribution, lw=0, marker='o', c='k', s=25)
     plt.ylim(ymax=1.2)
     if id!='Coverage': plt.xlabel('%s'%id)
     else: plt.xlabel('%s (miles)'%id)
     plt.ylabel('CDF')
     plt.grid(True)
     savefig(output_file_format%('cdf_'+id))
Example No. 17
    def plot_local_influencers(ltuo_model_id_and_hashtag_tag):
        tuples_of_boundary_and_boundary_label = [
                ([[24.527135,-127.792969], [49.61071,-59.765625]], 'USA', GeneralMethods.getRandomColor()),
                ([[10.107706,-118.660469], [26.40009,-93.699531]], 'Mexico', GeneralMethods.getRandomColor()),
                ([[-16.6695,88.409841], [30.115057,119.698904]], 'SE-Asia', GeneralMethods.getRandomColor()),
                ([[-29.565473,-58.191719], [7.327985,-30.418282]], 'Brazil', GeneralMethods.getRandomColor()),
            ]
        for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
            print model_id, hashtag_tag
            tuples_of_location_and_color = []
            for boundary, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label:
                tuo_location_and_influence_scores = Experiments.load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary)
                tuo_location_and_influence_scores = sorted(tuo_location_and_influence_scores, key=itemgetter(1))[:10]
                locations = zip(*tuo_location_and_influence_scores)[0]
                for location in locations: tuples_of_location_and_color.append([getLocationFromLid(location.replace('_', ' ')), boundary_color])
            locations, colors = zip(*tuples_of_location_and_color)
            plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors,  lw = 0, alpha=1.)
            for _, boundary_label, boundary_color in tuples_of_boundary_and_boundary_label: plt.scatter([0], [0], label=boundary_label, c=boundary_color, lw = 0)
            plt.legend(loc=3, ncol=4, mode="expand",)
#            plt.show()
            savefig(fld_results%(GeneralMethods.get_method_id()) +'%s_%s.png'%(model_id, hashtag_tag))
Example No. 18
    def plot_locations_influence_on_world_map(ltuo_model_id_and_hashtag_tag, noOfInfluencers=10, percentage_of_locations=0.15):
        input_locations = [
                               ('40.6000_-73.9500', 'new_york'),
                               ('33.3500_-118.1750', 'los_angeles'),
                               ('29.7250_-97.1500', 'austin'),
                           ('30.4500_-95.7000', 'college_station'),
                            ('-22.4750_-42.7750', 'rio'),
                           ('51.4750_0.0000', 'london'),
                           ('-23.2000_-46.4000', 'sao_paulo')
                         ] 
        for model_id, hashtag_tag in ltuo_model_id_and_hashtag_tag:
            tuo_location_and_tuo_neighbor_location_and_locations_influence_score = \
                    Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers=None, influence_type=InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE)
            for input_location, label in input_locations:
                for location, tuo_neighbor_location_and_locations_influence_score in \
                        tuo_location_and_tuo_neighbor_location_and_locations_influence_score:
                    if input_location==location:
                        input_location = getLocationFromLid(input_location.replace('_', ' '))
                        output_file = fld_results%GeneralMethods.get_method_id() + '/%s_%s/%s.png'%(model_id, hashtag_tag, label)
                        number_of_outgoing_influences = int(len(tuo_neighbor_location_and_locations_influence_score)*percentage_of_locations)
                        if number_of_outgoing_influences==0: number_of_outgoing_influences=len(tuo_neighbor_location_and_locations_influence_score)
                        locations = zip(*tuo_neighbor_location_and_locations_influence_score)[0][:number_of_outgoing_influences]
                        locations = [getLocationFromLid(location.replace('_', ' ')) for location in locations]
#                        locations = filter(lambda location: isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY), locations)
                        if locations:
                            _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False, bkcolor='#000000', c='#FF00FF', returnBaseMapObject=True, lw = 0)
#                            _, m = plotPointsOnWorldMap(locations, resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#FF00FF', returnBaseMapObject=True, lw = 0)
                            for location in locations: 
    #                            if isWithinBoundingBox(location, PARTIAL_WORLD_BOUNDARY): 
                                m.drawgreatcircle(location[1], location[0], input_location[1], input_location[0], color='#FAA31B', lw=1., alpha=0.5)
#                            plotPointsOnWorldMap([input_location], blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw = 0)
                            plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False, bkcolor='#000000', c='#003CFF', s=40, lw = 0)
#                            plotPointsOnWorldMap([input_location], resolution='c', blueMarble=False, bkcolor='#CFCFCF', c='#003CFF', s=40, lw = 0)
                            FileIO.createDirectoryForFile(output_file)
                            print output_file
                            savefig(output_file)
                            plt.clf()
                        else:
                            GeneralMethods.runCommand('rm -rf %s'%output_file)
                        break
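For reference, a bare-bones great-circle sketch with the (now-deprecated) Basemap toolkit, which plotPointsOnWorldMap above presumably wraps; note that drawgreatcircle takes longitudes before latitudes, which is why the snippet swaps indices [1] and [0]:

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

# World map on a Miller projection, coarse coastlines.
m = Basemap(projection='mill', resolution='c')
m.drawcoastlines()
# Arc from New York to London: lon1, lat1, lon2, lat2.
m.drawgreatcircle(-73.95, 40.60, 0.0, 51.48, color='#FAA31B', lw=1., alpha=0.5)
plt.savefig('greatcircle_sketch.png')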
Example No. 19
    def fraction_of_occurrences_vs_rank_of_location():
        output_file = fld_data_analysis_results%GeneralMethods.get_method_id() + '.png'
        ltuo_location_and_occurrence_count = []
        for location_object in\
                FileIO.iterateJsonFromFile(f_dense_hashtag_distribution_in_locations, remove_params_dict=True):
            ltuo_location_and_occurrence_count.append([
                                                      location_object['location'],
                                                      location_object['occurrences_count']
                                                    ])
#        ltuo_location_and_occurrence_count.sort(key=itemgetter(1))
#        for location, occurrence_count in ltuo_location_and_occurrence_count:
#            print location, occurrence_count
#        exit()
        total_occurrences = sum(zip(*ltuo_location_and_occurrence_count)[1]) + 0.0
        ltuo_lid_and_r_occurrence_count = sorted(ltuo_location_and_occurrence_count, key=itemgetter(1), reverse=True)
        y_fraction_of_occurrences = [r_occurrence_count/total_occurrences for _, r_occurrence_count in ltuo_lid_and_r_occurrence_count]
#        total_locations = len(y_fraction_of_occurrences)+0.
#        x_percentage_of_locations = [x/total_locations for x in range(1,len(y_fraction_of_occurrences)+1)]
        x_percentage_of_locations = range(1,len(y_fraction_of_occurrences)+1)
        plt.figure(num=None, figsize=(6,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.semilogy(x_percentage_of_locations, y_fraction_of_occurrences, lw=0, marker='o', c='k')   
        plt.ylabel('Fraction of occurrences')
        plt.xlabel('Locations ordered by their ranks')
        plt.grid(True)
        
        a = plt.axes([.55, .5, .3, .3])
#        plt.plot(range(10))
        plt.semilogy(x_percentage_of_locations, y_fraction_of_occurrences, lw=0, marker='o', c='k')   
#        plt.title('Probability')
        plt.grid(True)
        yticks = plt.yticks()
        plt.yticks([yticks[0][-1], yticks[0][0]])
#        plt.ylim(ymin=0.000001, ymax=0.15)
#        plt.ylim(ymin=-0.01, ymax=0.04)
        plt.xlim(xmin=-4, xmax=200)
        plt.setp(a)
        
#        plt.show()
        savefig(output_file)
Example No. 20
    def example_for_caverlee():
#        valid_locations = ['18T_585E_4512N', '18T_587E_4514N']
        mf_lid_to_location = dict([
                                   ('18T_585E_4512N', 'Times Square'),
                                   ('18T_587E_4514N', 'Central Park'),
                                   ('18T_584E_4511N', 'Penn Station'),
                                   ('18T_585E_4511N', 'Empire State Building'),
                                   ])
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        subplot_num = 221
#        plt.figure(num=None, figsize=(6,3))
        for data in FileIO.iterateJsonFromFile(f_example_for_caverlee, remove_params_dict=True):
            location = data['location']
            if location in mf_lid_to_location:
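                # shift epoch timestamps back 5 hours (UTC to US Eastern, DST ignored)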
                td = timedelta(hours=-5)
                ltuo_occ_time_and_count = data['ltuo_occ_time_and_count']
                ltuo_occ_time_and_count.sort(key=itemgetter(0))
                occ_times, counts = zip(*ltuo_occ_time_and_count)
                occ_times = map(datetime.fromtimestamp, occ_times)
                occ_times = map(lambda d: d+td, occ_times)
                occ_hours = map(lambda d: d.hour, occ_times)
                ltuo_occ_hour_and_count = zip(occ_hours, counts)
                ltuo_occ_hour_and_count = [(h, sum(zip(*h_c)[1])) for h, h_c in
                                            GeneralMethods.group_items_by(ltuo_occ_hour_and_count, key=itemgetter(0))]
                occ_hours, counts = zip(*ltuo_occ_hour_and_count)
                total_counts = sum(counts)+0.0
                counts = map(lambda c: c/total_counts, counts)
                plt.subplot(subplot_num)
#                plt.subplots_adjust(bottom=0.2, top=0.9)
                subplot_num+=1
                plt.plot(occ_hours, counts, color='#EA00FF', lw=1)
                plt.fill_between(occ_hours, counts, color='#EA00FF', alpha=0.25)
#                plt.ylabel('% of tweets')
                plt.xlabel('Time of day')
                plt.xlim(xmax=23)
                plt.ylim(ymax=0.09)
                plot_anchored_text(mf_lid_to_location[location], loc=2)
                plt.grid(True)
#                savefig(output_file_format%mf_lid_to_location[location].replace(' ', '_'))
        savefig(output_file_format%'ny_locations')
Example No. 21
 def plot_correlation_between_influence_similarity_and_jaccard_similarity(model_ids):
     for model_id in model_ids:
         mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities = {}
         for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                 enumerate(FileIO.iterateJsonFromFile(tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
             print line_count
             for neighbor_location, mf_influence_type_to_similarity in \
                     tuo_neighbor_location_and_mf_influence_type_and_similarity:
                 jaccard_similarity = round(mf_influence_type_to_similarity[JACCARD_SIMILARITY], 1)
                 for influence_type in \
                         [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
                     if influence_type not in mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities: 
                         mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type] = defaultdict(list)
                     mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities[influence_type][jaccard_similarity]\
                         .append(mf_influence_type_to_similarity[influence_type])
         subplot_id = 211
         for influence_type, mf_jaccard_similarity_to_influence_similarities in \
                 mf_influence_type_to_mf_jaccard_similarity_to_influence_similarities.iteritems():
             plt.subplot(subplot_id)
             x_jaccard_similarities, y_influence_similarities = [], []
             for jaccard_similarity, influence_similarities in \
                     sorted(mf_jaccard_similarity_to_influence_similarities.iteritems(), key=itemgetter(0)):
                 influence_similarities=filter_outliers(influence_similarities)
                 if len(influence_similarities) > 10:
                     x_jaccard_similarities.append(jaccard_similarity)
                     y_influence_similarities.append(np.mean(influence_similarities))
             rho, p_value = pearsonr(x_jaccard_similarities, y_influence_similarities)
             
             plt.scatter(x_jaccard_similarities, y_influence_similarities,  
                         c = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], 
                         lw=0, s=40)
             plt.plot(x_jaccard_similarities, y_influence_similarities, 
                         c = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'],  lw=2)
             if influence_type==InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE: plt.ylabel('Influencing locations similarity', fontsize=13)
             else: plt.ylabel('Influenced locations similarity', fontsize=13)
             subplot_id+=1
         plt.xlabel('Jaccard similarity', fontsize=13)
         savefig('images/%s.png'%GeneralMethods.get_method_id())
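pearsonr above comes from scipy.stats; a minimal usage sketch of the correlation step:

from scipy.stats import pearsonr

# rho is in [-1, 1]; p_value tests the null hypothesis of no correlation.
x = [0.1, 0.2, 0.3, 0.4, 0.5]
y = [0.15, 0.22, 0.33, 0.38, 0.52]
rho, p_value = pearsonr(x, y)
print(rho, p_value)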
Example No. 22
 def plot_graph(ltuo_locality_measure_and_occurrences_count, id):
     mf_normalized_occurrences_count_to_locality_measures = defaultdict(list)
     for locality_measure, occurrences_count in \
             ltuo_locality_measure_and_occurrences_count:
         normalized_occurrence_count =\
         int(occurrences_count/ACCURACY_NO_OF_OCCURRANCES)*ACCURACY_NO_OF_OCCURRANCES+ACCURACY_NO_OF_OCCURRANCES
         mf_normalized_occurrences_count_to_locality_measures[normalized_occurrence_count].append(
                                                                                                 locality_measure
                                                                                             )
     x_occurrence_counts, y_locality_measures = [], []
     for k in sorted(mf_normalized_occurrences_count_to_locality_measures):
         if len(mf_normalized_occurrences_count_to_locality_measures[k]) > 10:
             x_occurrence_counts.append(k)
             y_locality_measures.append(np.mean(mf_normalized_occurrences_count_to_locality_measures[k]))
     x_occurrence_counts = [x/1000. for x in x_occurrence_counts]
     plt.figure(num=None, figsize=(4.3,3.0))
     plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15, wspace=0.)
     plt.scatter(x_occurrence_counts, y_locality_measures, lw=0, marker='o', c='k', s=50)
     plt.xlabel('Hashtag occurrences in thousands')
     plt.ylabel('Mean hashtag %s'%id)
     plt.grid(True)
     savefig(output_file_format%('locality_vs_occurrences_'+id))
Example No. 23
    def plot_correlation_between_influence_similarity_and_distance(model_ids, distance_accuracy=500):
        def get_larger_lid(lid): return getLatticeLid(getLocationFromLid(lid.replace('_', ' ')), 10)
        for model_id in model_ids:
            mf_influence_type_to_tuo_distance_and_similarity = defaultdict(list)
            for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                    enumerate(FileIO.iterateJsonFromFile(tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
                print line_count
                for neighbor_location, mf_influence_type_to_similarity in \
                        tuo_neighbor_location_and_mf_influence_type_and_similarity:
                    distance = getHaversineDistance(getLocationFromLid(location.replace('_', ' ')), getLocationFromLid(neighbor_location.replace('_', ' ')))
                    distance = int(distance)/distance_accuracy*distance_accuracy + distance_accuracy
                    for influence_type, similarity in mf_influence_type_to_similarity.iteritems():
                        mf_influence_type_to_tuo_distance_and_similarity[influence_type].append([distance, similarity])
            subplot_id = 211
            for influence_type in \
                    [InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE]:
                tuo_distance_and_similarity = mf_influence_type_to_tuo_distance_and_similarity[influence_type]
                tuo_distance_and_similarities =  [(distance, zip(*ito_tuo_distance_and_similarity)[1])
                                                    for distance, ito_tuo_distance_and_similarity in groupby(
                                                            sorted(tuo_distance_and_similarity, key=itemgetter(0)),
                                                            key=itemgetter(0)
                                                        )
                                                ]
                plt.subplot(subplot_id)
                x_distances, y_similarities = [], []
                for distance, similarities in tuo_distance_and_similarities:
#                    similarities=filter_outliers(similarities)
                    x_distances.append(distance), y_similarities.append(np.mean(similarities))
    #            x_distances, y_similarities = splineSmooth(x_distances, y_similarities)
                plt.semilogy(x_distances, y_similarities, c = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['color'], 
                         lw=2, marker = InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['marker'])
                plt.ylabel(InfluenceMeasuringModels.INFLUENCE_PROPERTIES[influence_type]['label'], fontsize=13)
                subplot_id += 1
            plt.xlabel('Distance (Miles)', fontsize=13)
#            plt.show()
            savefig('images/%s.png'%(GeneralMethods.get_method_id()))
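getHaversineDistance above is external; a standard haversine implementation that could stand in for it (assuming it returns miles, per the axis label):

from math import radians, sin, cos, asin, sqrt

def haversine_miles(p1, p2):
    # Great-circle distance between two (lat, lng) points in miles.
    lat1, lng1, lat2, lng2 = map(radians, (p1[0], p1[1], p2[0], p2[1]))
    a = sin((lat2-lat1)/2)**2 + cos(lat1)*cos(lat2)*sin((lng2-lng1)/2)**2
    return 2 * 3958.8 * asin(sqrt(a))  # 3958.8 = Earth radius in miles

# haversine_miles((40.6, -73.95), (51.475, 0.0)) -> roughly 3460 miles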
Example No. 24
def performanceWithSpamDetectionVaryingPercentageOfSpammers(generateData):
    experimentData = defaultdict(dict)
    ratios = [0.0,0.4,0.9]
    marker = dict([(0.0, 's'), (0.4, 'o'), (0.9, 'd')])
#    spammerPercentages = [0.2, 0.01, 0.01]
#    spammerPercentages = [0.015, 0.015, 0.015]
    for iteration in range(10):
        l1 = [spammerPercentage* 0.001 for spammerPercentage in range(1,51)]
        l2 = [spammerPercentage* 0.05 for spammerPercentage in range(1,21)]
        l3 = [0.01]+l2
        spammer_percentages = l3
        for spammerPercentage in spammer_percentages:
            for spamDetectionRatio, spammerPercentage in zip(ratios, [spammerPercentage]*3):
                experimentFileName = spamModelFolder+'performanceWithSpamDetectionVaryingPercentageOfSpammers/%s/%0.3f/%0.3f'%(iteration,spammerPercentage, spamDetectionRatio)
                print experimentFileName
                if generateData:
                    model = MixedUsersModel()
                    conf = {'model': model,
                            'numberOfTimeSteps': 10,
                            'addUsersMethod': User.addUsersUsingRatioWithSpamDetection,
                            'analysisMethods': [(Analysis.measureRankingQuality, 1)],
                            'ratio': {'normal': 1-spammerPercentage, 'spammer': spammerPercentage},
#                            'spammerMessagingProbability': spammerBudget,
                            'rankingMethods': [RankingModel.latestMessages, RankingModel.latestMessagesSpamFiltered,
                                               RankingModel.popularMessages, RankingModel.popularMessagesSpamFiltered],
                            'spamDetectionRatio': spamDetectionRatio,
                            'experimentFileName': experimentFileName}
                    GeneralMethods.runCommand('rm -rf %s'%experimentFileName)
                    run(**conf)
                else:
#                    for data in FileIO.iterateJsonFromFile(experimentFileName):
#                        for ranking_id in data['spammmess']:
#                            if data['currentTimeStep'] not in experimentData[spamDetectionRatio]: experimentData[spamDetectionRatio][data['currentTimeStep']]=defaultdict(list)
#                            experimentData[spamDetectionRatio][data['currentTimeStep']][ranking_id]+=data['spammmess'][ranking_id]
                            
                    tempData = defaultdict(list)
                    for data in FileIO.iterateJsonFromFile(experimentFileName):
                        for ranking_id in data['spammmess']:
                            tempData[ranking_id]+=data['spammmess'][ranking_id]
                    if spammerPercentage not in experimentData[spamDetectionRatio]: experimentData[spamDetectionRatio][spammerPercentage]=defaultdict(list)
                    for ranking_id in tempData:
                        experimentData[spamDetectionRatio][spammerPercentage][ranking_id]+=tempData[ranking_id]
    if not generateData:
        sdr = {}
        for spamDetectionRatio in sorted(experimentData.keys()):
            dataToPlot = defaultdict(list)
#            for spammerPercentage in sorted(experimentData[spamDetectionRatio]):
            for spammerPercentage in spammer_percentages:
                dataToPlot['x'].append(spammerPercentage)
                for ranking_id in experimentData[spamDetectionRatio][spammerPercentage]: dataToPlot[ranking_id].append(np.mean(experimentData[spamDetectionRatio][spammerPercentage][ranking_id]))
            sdr[spamDetectionRatio]=dataToPlot
#        for ranking_id in [RankingModel.LATEST_MESSAGES_SPAM_FILTERED, RankingModel.POPULAR_MESSAGES_SPAM_FILTERED]:
        for ranking_id in [RankingModel.LATEST_MESSAGES, RankingModel.POPULAR_MESSAGES]:
            for spamDetectionRatio in ratios:
                print ranking_id, spamDetectionRatio
#                dataY = smooth(sdr[spamDetectionRatio][ranking_id],8)[:len(sdr[spamDetectionRatio]['x'])]
                dataY = sdr[spamDetectionRatio][ranking_id][:len(sdr[spamDetectionRatio]['x'])]
#                dataX, dataY = sdr[spamDetectionRatio]['x'][10:], dataY[10:]
                dataX, dataY = sdr[spamDetectionRatio]['x'], dataY
#                dataX, dataY = splineSmooth(dataX, dataY)
#                if spamDetectionRatio==0.0: plt.plot([x-10 for x in dataX], dataY, label='%s'%(labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
#                else: plt.plot([x-10 for x in dataX], dataY, label='%s (%d'%(labels[ranking_id].replace('Filtering', 'Detection'),spamDetectionRatio*100)+'%)', lw=1, marker=marker[spamDetectionRatio])
                if spamDetectionRatio==0.0: plt.plot(dataX, dataY, label='%s'%(labels[ranking_id]), lw=1, marker=marker[spamDetectionRatio])
                else: plt.plot(dataX, dataY, label='%s after spam detection (%d'%(labels[ranking_id].replace('Filtering', 'Detection'),spamDetectionRatio*100)+'%)', lw=1, marker=marker[spamDetectionRatio])
#            plt.show()
#            plt.xlim(xmax=0.05)
#            plt.ylim(ymax=0.8)
            plt.legend(loc=4)
            plt.xlabel('Percentage of spammers', fontsize=16, fontweight='bold')
            plt.ylabel('Spamness', fontsize=16, fontweight='bold')
#            plt.show()
#            plt.savefig('performanceWithSpamDetectionVaryingPercentageOfSpammers_%s.png'%ranking_id)
            savefig('/Users/krishnakamath/Dropbox/temp/performanceWithSpamDetectionVaryingPercentageOfSpammers_%s.png'%ranking_id)
#            plt.show()
            plt.clf()
Example No. 25
    def norm_iid_vs_locality_measures():
        TIME_UNIT_IN_SECONDS = 10.*60.
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.png'
        ltuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage = \
                    [data for data in FileIO.iterateJsonFromFile(f_norm_iid_spatial_metrics, remove_params_dict=True)]
        x_normalized_iids, y_entropies, y_focuses, y_distance_from_overall_entropy, y_distance_from_overall_focus, y_coverages = \
                                                     zip(*sorted([(data[0]*TIME_UNIT_IN_SECONDS/60, data[1][1], data[1][2], data[1][4], data[1][5], data[1][3]) 
                                                                      for data in 
                                                                        ltuo_normalized_iid_and_tuo_prct_of_occurrences_and_entropy_and_focus_and_coverage
                                                                  ])
                                                        )
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.subplot(111)
        plt.xlim(xmin=-20, xmax=200)
#        plt.ylim(ymin=0.5, ymax=1.0)
        plt.plot(x_normalized_iids, y_coverages,  lw=1, c='k')
        plt.scatter(x_normalized_iids, y_coverages, lw=0, marker='o', s=50, c='k')
        plt.ylabel('Interval coverage')
        plt.xlabel('Minutes since peak')
        plt.grid(True)
        savefig(output_file_format%'coverage')
        plt.clf() 
        
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.subplot(111)
        plt.xlim(xmin=-20, xmax=120)
        plt.ylim(ymin=0.55, ymax=0.70)
        plt.plot(x_normalized_iids, y_entropies,  lw=1, c='k')
        plt.scatter(x_normalized_iids, y_entropies, lw=0, marker='o', s=50, c='k')
        plt.ylabel('Interval entropy')
        plt.xlabel('Minutes since peak')
        plt.grid(True)
        savefig(output_file_format%'entropy')
        plt.clf() 
        
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.subplot(111)
        plt.xlim(xmin=-20, xmax=400)
#        plt.ylim(ymin=1, ymax=3)
        plt.plot(x_normalized_iids, y_distance_from_overall_entropy, lw=1, c='k')                               
        plt.scatter(x_normalized_iids,  y_distance_from_overall_entropy, marker='o', s=50, c='k')
        plt.xlabel('Minutes since peak')
        plt.ylabel('Distance from overall entropy')
        plt.grid(True)
        savefig(output_file_format%'distance_from_overall_entropy')
        plt.clf()   
        
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.subplot(111)
        plt.xlim(xmin=-20, xmax=120)
        plt.ylim(ymin=0.797, ymax=0.84)
        plt.plot(x_normalized_iids, y_focuses, lw=1, c='k')
        plt.scatter(x_normalized_iids, y_focuses, lw=1, marker='o', s=50, c='k')     
        plt.xlabel('Minutes since peak')
        plt.ylabel('Interval focus')
        plt.grid(True)
        savefig(output_file_format%'focus')
        plt.clf()
        
        plt.figure(num=None, figsize=(4.3,3))
        plt.subplots_adjust(bottom=0.2, top=0.9)
        plt.subplot(111)
        plt.xlim(xmin=-20, xmax=400)
#        plt.ylim(ymin=-0.43, ymax=-0.19)
        plt.plot(x_normalized_iids, y_distance_from_overall_focus, lw=1, c='k')                               
        plt.scatter(x_normalized_iids, y_distance_from_overall_focus, marker='o', s=50, c='k')   
        plt.xlabel('Minutes since peak')
        plt.ylabel('Distance from overall focus')
        plt.grid(True)
        savefig(output_file_format%'distance_from_overall_focus')
Example No. 26
    def coverage_vs_spatial_properties():
        output_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'_%s.png'
        output_text_file_format = fld_data_analysis_results%GeneralMethods.get_method_id()+'/%s.txt'
        data = [d for d in FileIO.iterateJsonFromFile(f_hashtag_spatial_metrics, remove_params_dict=True)]
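        # 'num_of_occurrenes' (sic) and the repeated 'focus' key presumably mirror the source data:
        # focus appears to be a (location, score) pair, read once for its score and once for its location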
        keys = ['entropy', 'focus', 'spread', 'hashtag', 'num_of_occurrenes', 'focus']
        ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location = map(itemgetter(*keys), data)
        ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location =\
                                        map(
                                              lambda (a,b,c,d,e,f): (a,b[1],c,d,e,f[0]),
                                              ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location
                                        )
#        ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location = [(data[2], data[3][1], data[4], data[0], data[1], data[3][0]) for data in iterateJsonFromFile(input_file)]
        mf_coverage_to_entropies = defaultdict(list)
        mf_coverage_to_focuses = defaultdict(list)
        mf_coverage_boundary_to_tuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location =\
                                                                                                    defaultdict(list)
        total_hashtags = len(ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location)+0.
        for entropy, focus, coverage, hashtag, occurrence_count, focus_location in\
                ltuo_entropy_focus_coverage_hashtag_occurrence_count_and_focus_location:
            coverage = int(coverage/100)*100+100
            mf_coverage_to_entropies[coverage].append(entropy)
            mf_coverage_to_focuses[coverage].append(focus)
            coverage_boundary = 800
            if 800<coverage<1600: coverage_boundary=1600
            elif 1600<coverage: coverage_boundary=4000
            mf_coverage_boundary_to_tuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location\
                                [coverage_boundary].append((entropy, focus, hashtag, occurrence_count, focus_location))
        
        for coverage_boundary, ltuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location in \
                mf_coverage_boundary_to_tuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location\
                                                                                                        .iteritems():
            ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location = \
                sorted(ltuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location, key=itemgetter(3), reverse=True)
            for entropy, focus, hashtag, occurrence_count, focus_location in \
                    ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location:
                FileIO.writeToFileAsJson(
                                         [hashtag, occurrence_count, entropy, focus, focus_location],
                                         output_text_file_format%coverage_boundary
                                        )
            print coverage_boundary,\
                        len(ltuo_entropy_and_focus_and_hashtag_and_occurrence_count_and_focus_location)/total_hashtags
            print 'median entropy: ',\
                        np.median(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[0])
            print 'median focus: ',\
                        np.median(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[1])
#            print 'var entropy: ', np.var(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[0])
#            print 'var focus: ', np.var(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[1])

#            print 'range entropy: ', getOutliersRangeUsingIRQ(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[0])
#            print 'range focus: ', getOutliersRangeUsingIRQ(zip(*ltuo_entropy_and_focus_and_hashtag_and_s_occurrence_count_and_focus_location)[1])
            
        x_coverages, y_entropies = zip(*[(coverage, np.mean(entropies)) 
                                         for coverage, entropies in mf_coverage_to_entropies.iteritems()
                                         if len(entropies) > 250])
        x_coverages, y_focuses = zip(*[(coverage, np.mean(focuses)) 
                                         for coverage, focuses in mf_coverage_to_focuses.iteritems()
                                         if len(focuses) > 250])
        plt.figure(num=None, figsize=(4.3,3))
        ax = plt.subplot(111)
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15)
        plt.scatter(x_coverages, y_entropies, lw=0, marker='o', c='k', s=25)
#        plt.ylim(ymax=1.2)
        plt.xlabel('Spread (miles)')
        plt.ylabel('Entropy')
#        ax.set_xscale('log')
        plt.grid(True)
        savefig(output_file_format%'entropy')
        
        plt.figure(num=None, figsize=(4.3,3))
        ax = plt.subplot(111)
        plt.subplots_adjust(bottom=0.2, top=0.9, left=0.15)
        plt.scatter(x_coverages, y_focuses, lw=0, marker='o', c='k', s=25)
#        plt.ylim(ymax=1.2)
        plt.xlabel('Spread (miles)')
        plt.ylabel('Focus')
#        ax.set_xscale('log')
        plt.grid(True)
        savefig(output_file_format%'focus')