def evaluateCorpus(self, printmsg=False):
        distances = []
        valids = 0
        invalids = 0

        distance_matches = 0
        distance_mismatches = 0

        cluster_matches = 0
        cluster_mismatches = 0
        
        n = len(self.clusters)
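        # real_to_calc_matches is an n x (n+1) table with one row per real cluster:
        # column 0 stores the cluster index, and the remaining columns are updated by
        # EvaluationFunctions.evaluateCluster (which receives the table) with
        # real-vs-predicted cluster counts.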
        real_to_calc_matches = [[0 for x in range(n+1)] for x in range(n)] 
        for i in range(n):
            real_to_calc_matches[i][0] = i

        for self.i in range(0,self.n):
            values = self.evaluateTweet(self.tweets[self.i], self.location[self.i], self.users[self.i])
            if values is None:
                invalids += 1
            else:
                lon_calculated, lat_calculated, lon_real, lat_real, distance = values
                distances.append(distance)

                if EvaluationFunctions.evaluateDistance(distance, self.distance_threshold):
                    distance_matches += 1
                else:
                    distance_mismatches += 1

                if EvaluationFunctions.evaluateCluster(lon_calculated, lat_calculated, lon_real, lat_real, self.clusters, real_to_calc_matches):
                    cluster_matches += 1
                else:
                    cluster_mismatches += 1

                valids += 1

        distances_np = np.asarray(distances, dtype=float)
        if printmsg:
            print 'valid: ', valids, 'invalid: ', invalids

            print 'distance_match: ', distance_matches, 'distance_mismatches: ', distance_mismatches
            if distance_matches + distance_mismatches > 0:
                print 'distance_ratio: ', str(float(distance_matches) / (distance_matches + distance_mismatches))

            print 'cluster_matches: ', cluster_matches, 'cluster_mismatches: ', cluster_mismatches
            if cluster_matches + cluster_mismatches > 0:
                print 'cluster_ratio: ', str(float(cluster_matches) / (cluster_matches + cluster_mismatches))

        # print "not used: ", self.tmpscore
        #print tabulate(real_to_calc_matches, tablefmt="latex",headers=range(n))
        
        #print tabulate(EvaluationFunctions.transformStatistice(real_to_calc_matches), tablefmt="latex",headers=range(n))
        
        valid_ratio = valids / float(valids + invalids)
        valid_ratio = valids  # NOTE: overrides the ratio computed above; the raw count of valid tweets is returned
        # Returns (mean distance in km, median distance in km, valid_ratio);
        # the distances are infinite when no tweet could be evaluated.
        if valids > 0:
            return (np.mean(distances_np), np.median(distances_np), valid_ratio)
        else:
            return (float('inf'), float('inf'), valid_ratio)
Example #2
    def createFallback(self):
        # Create fall-back tokens for every user: keep the (up to five) most frequent
        # tokens of that user which also pass the variance threshold.
        for user, tweets in self.user_to_tweets.iteritems():
            tid_to_count = {}
            for tweet in tweets:
                for token in EvaluationFunctions.getCoOccurrences(tweet):
                    tid = self.signature.add(token)
                    if tid in self.token_data:
                        if self.checkVarianceThreshold(tid):
                            tid_to_count.setdefault(tid, 0)
                            tid_to_count[tid] += 1

            amount = min(5, len(tid_to_count))

            token_data = []
            for tid, count in sorted(tid_to_count.iteritems(),
                                     key=itemgetter(1),
                                     reverse=True)[:amount]:
                data = self.token_data[tid]
                variance = data['variance']
                count = data['count']  # corpus-wide count (overrides the per-user count used for sorting)
                token_data.append((self.signature.get(tid), variance, count,
                                   data["median"], data["variances"]))

            self.fallback[user] = token_data
Example #3
def pickleTrainingCorpus(filename):
    token_to_data = {}  #< maps a token to a tuple of its coordinates, variance and count:
                        #< ((lon, lat), variance, count)
    COUNT_THRESHOLD = 0

    # Make connection
    database = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TRAIN")

    # Iterate over all tweets and split the tokenised texts.
    # Each token maps to the list of Cartesian (x, y, z) points at which it occurs.
    token_distribution_cart = {}
    tweet_coordinates = []
    for tokens, lat, lon in database.getRows("`tokenised_low`, `lat`, `long`"):
        tweet_coordinates.append((lon, lat))
        cartesian = EvaluationFunctions.convertLatLongToCartesian(lon, lat)
        for token in EvaluationFunctions.getCoOccurrences(tokens.split()):
            token_distribution_cart.setdefault(token, []).append(cartesian)

    for token, coordinates_of_tuple in token_distribution_cart.iteritems():
        count = len(coordinates_of_tuple)
        if count > COUNT_THRESHOLD:
            # Convert coordinate list to numpy array
            np_list = np.asarray(coordinates_of_tuple, dtype=float)

            # Calculate the mean of the Cartesian coordinates
            (mean_x, mean_y, mean_z) = tuple(np.mean(np_list, axis=0))

            # Variance: mean squared Euclidean distance from the mean point
            variance_num = 0
            for (x, y, z) in coordinates_of_tuple:
                variance_num += (x - mean_x)**2 + (y - mean_y)**2 + (z - mean_z)**2
            variance = variance_num / count

            # Calculate the median of the Cartesian coordinates
            (median_x, median_y, median_z) = tuple(np.median(np_list, axis=0))

            token_to_data[token] = (
                EvaluationFunctions.convertCartesianToLatLong(median_x, median_y, median_z),
                variance, count)

    with open(filename, 'wb') as f:
        pickle.dump(token_to_data, f)
    return tweet_coordinates
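
# For reference, the per-token variance computed in the loop above is the mean squared
# Euclidean distance of the Cartesian points from their mean point. A vectorized NumPy
# equivalent (a sketch; `points` corresponds to the (count, 3) array called np_list above):
def mean_squared_distance_from_mean(points):
    points = np.asarray(points, dtype=float)
    return np.mean(np.sum((points - points.mean(axis=0)) ** 2, axis=1))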
Example #5
    def __init__(self, signature=None, clusters=None, corpus='DEV'):
        self.tweets = []  # list of tokenised tweets
        self.location = []  # list of (lon, lat) tuples
        self.n = 0  # The size of the corpus
        self.clusters = None  # List of centroid coordinates
        self.variance_threshold = 0
        self.distance_threshold = 0
        self.draw = False  # Toggle whether each tweet should be saved to a PNG file
        self.evaluator = None  # Creates the weights for the tokens in a tweet
        self.null = False  # Test the null hypothesis (replace tokens with random ones)
        self.signature = signature
        self.clusters = clusters
        self.users = []
        self.fallback = {}

        # Load corpus from database:
        database = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus=corpus)
        self.user_to_tweets = {}

        for tokens, lat, lon, user in database.getRows(
                "`tokenised_low`, `lat`, `long`, `user_id`"):
            self.tweets.append(tokens.split())
            self.users.append(user)
            self.user_to_tweets.setdefault(user, []).append(tokens.split())
            self.location.append((lon, lat))
        self.n = len(self.tweets)
        assert len(self.tweets) == len(self.location)

        # Lookup tokendata
        self.token_data = {}

        # collect ids
        ids = []
        for tweet in self.tweets:
            for token in EvaluationFunctions.getCoOccurrences(tweet):
                i = self.signature.add(token)
                ids.append(i)
        ids = set(ids)
        # Get data from database
        token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA")
        for token_id, medianx, mediany, medianz, variance, variancex, variancey, variancez, count \
            in token_db.getTokenInfo(ids, columns= \
            "`id`, `median_x`, `median_y`, `median_z`, `variance`, `variance_x`, `variance_y`, `variance_z`, `count`"):
            self.token_data[token_id] = {
                "median": (medianx, mediany, medianz),
                "variance": variance,
                "count": count,
                "variances": (variancex, variancey, variancez)
            }
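
    # The per-token filter self.checkVarianceThreshold(tid) used by createFallback and
    # evaluateTweet is not part of this listing. A minimal sketch, consistent with the
    # module-level checkVarianceThreshold((x, y, z)) further down, and assuming that
    # self.variance_threshold holds a (tx, ty, tz) triple (an assumption, not the
    # class's actual method):
    def checkVarianceThreshold(self, tid):
        vx, vy, vz = self.token_data[tid]["variances"]
        tx, ty, tz = self.variance_threshold
        return vx < tx and vy < ty and vz < tz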
cluster = int(sys.argv[3])

token_to_data_filtered = {}

# Filter by count
for token, items in token_to_data.iteritems():
    if items[2] >= COUNT_THRESHOLD:
        token_to_data_filtered[token] = items

if cluster != -1:  # -1 keeps tokens from all clusters (all of DE, AT, CH); otherwise filter to the requested cluster
    clusters = pickle.load(open(sys.argv[2], 'rb'))  #< list of cluster centroids

    token_to_data_filtered_cluster = {}
    for token, items in token_to_data_filtered.iteritems():
        lon, lat = items[0]
        c = EvaluationFunctions.getCluster(lon, lat, clusters)
        if c == cluster:
            token_to_data_filtered_cluster[token] = items

    token_to_data_filtered = token_to_data_filtered_cluster


sorted_by_variance = sorted(token_to_data_filtered.iteritems(), key=lambda x: x[1][1], reverse=True)

num_tokens = len(sorted_by_variance)
# Print tokens sorted by variance (highest first), together with their counts
for token, items in sorted_by_variance:
    print token + " (" + str(items[2]) + ")"
print num_tokens

Example #10
    def evaluateTweet(self, tokens, location, user):
        token_data_here = []
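        # token_data_here collects (token, variance, count, median_xyz, per_axis_variances)
        # tuples for every token that passes the variance threshold; it feeds the
        # evaluator below, which derives the weights for the midpoint calculation.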

        valid = 0
        if self.draw:
            basemap = MapFunctions.prepareMap()

        text_pos = 1890000

        # Look up the data for each token in the tweet
        for token in EvaluationFunctions.getCoOccurrences(tokens):
            token_id = self.signature.add(token)
            if token_id not in self.token_data:
                if False:  # drawing of unknown tokens is disabled (was: if self.draw)
                    plt.text(10000,
                             text_pos,
                             token.decode('utf8', 'ignore') + ' | (fail)',
                             color='grey',
                             fontsize=6)
                    text_pos -= 42000
                continue

            data = self.token_data[token_id]
            variance = data['variance']
            count = data['count']
            x, y, z = data["median"]
            lon, lat = EvaluationFunctions.convertCartesianToLatLong(x, y, z)
            if self.checkVarianceThreshold(token_id):
                valid += 1
                # Null hypothesis: swap in the data of a randomly chosen token
                # (randint is inclusive at both ends, hence len() - 1)
                if self.null:
                    random_id = self.token_data.keys()[randint(0, len(self.token_data) - 1)]
                    data = self.token_data[random_id]
                    variance = data['variance']
                    count = data['count']

                if self.draw:
                    #plt.text(10000, text_pos, token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count), color='black', fontsize=6)
                    text_pos -= 42000
                    current_color = EvaluationFunctions.getColorForValue(
                        variance)
                    basemap.plot(
                        lon,
                        lat,
                        'o',
                        latlon=True,
                        markeredgecolor=current_color,
                        color=current_color,
                        markersize=EvaluationFunctions.getSizeForValue(count),
                        alpha=0.7)

                token_data_here.append((token, variance, count, data["median"],
                                        data["variances"]))

            else:
                if self.draw:
                    #plt.text(10000, text_pos,   token.decode('utf8', 'ignore') + ' | ' + str(round(variance,1)) + ' | ' + str(count),color='grey', fontsize=6)
                    text_pos -= 40000
                    current_color = 'gray'
                    basemap.plot(
                        lon,
                        lat,
                        'o',
                        latlon=True,
                        markeredgecolor=current_color,
                        color=current_color,
                        markersize=EvaluationFunctions.getSizeForValue(count),
                        alpha=0.1)

        if valid == 0:
            # use fallback
            #if user in self.fallback:
            #    token_data_here = self.fallback[user]
            #else:
            #    print user , " not in " , self.fallback.keys()
            if len(token_data_here) == 0:
                plt.clf()
                return None
            #else:
            #    print "!"

        # Generate the data for the weighted midpoint
        coordinate_list, weight_list = self.evaluator.evaluate(token_data_here)

        # Calculate the midpoint
        lon_score, lat_score = EvaluationFunctions.getWeightedMidpointXYZ(
            coordinate_list, weight_list)

        distance = EvaluationFunctions.getDistance(lon_score, lat_score,
                                                   location[0], location[1])

        #print " ".join(tokens)
        #print distance
        #print valid
        #print ""

        if self.draw:
            basemap.plot(location[0],
                         location[1],
                         '^',
                         mfc='none',
                         markeredgecolor='black',
                         latlon=True,
                         alpha=1)
            basemap.plot(lon_score,
                         lat_score,
                         'v',
                         mfc='none',
                         markeredgecolor='black',
                         latlon=True,
                         alpha=1)

            plt.text(10000, 10000,
                     'Distance: ' + str(round(distance, 1)) + 'km')
            plt.text(10000, 80000,
                     'Threshold: ' + str(self.variance_threshold))
            plt.savefig('img/tweet_' + str(self.variance_threshold) + "_" +
                        str(self.i) + ".png",
                        format='png')
            plt.clf()

        return (lon_score, lat_score, location[0], location[1], distance)
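
# Hedged sketch of the weighted-midpoint idea used above; this is NOT the repository's
# actual EvaluationFunctions.getWeightedMidpointXYZ, just an illustration under the
# assumption that the coordinate list holds Cartesian (x, y, z) points (as stored in
# token_data["median"]) and that numpy is available as np:
def weighted_midpoint_xyz(coordinates, weights):
    points = np.asarray(coordinates, dtype=float)
    w = np.asarray(weights, dtype=float)
    x, y, z = (points * w[:, None]).sum(axis=0) / w.sum()
    return EvaluationFunctions.convertCartesianToLatLong(x, y, z)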
Example #11
import os
import sys

import cPickle as pickle

from Wrapper import MySQLConnection
from Evaluation import EvaluationFunctions
"""
Print the most regional tokens for a given cluster.

Usage:
python PrintRegionalTokens.py Signature Lon Lat range

"""

if len(sys.argv) < 4:
    print "Usage: python PrintRegionalTokens.py Signature Lon Lat"
    sys.exit(1)

signature = pickle.load(open(sys.argv[1], 'rb'))
lon = float(sys.argv[2])
lat = float(sys.argv[3])
rang = 50  # search radius in km

token_to_data = {}
token_db = MySQLConnection.MySQLConnectionWrapper(basedir=os.getcwd() + "/", corpus="TOKENDATA")

for tid, count, medx, medy, medz, varx, vary, varz in token_db.getTokenInfo(ids=None, columns="`id`, `count`, `median_x`, `median_y`, `median_z`, `variance_x`, `variance_y`, `variance_z`"):
    lon_, lat_ = EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz)
    distance = EvaluationFunctions.getDistance(lon, lat, lon_, lat_)
    if distance < rang and count > 20:
        print signature.get(tid), ",", (varx,vary,varz), ",", count

        pos -= 1
    x = variances_x[pos]
    y = variances_y[pos]
    z = variances_z[pos]
    return (x,y,z)

def checkVarianceThreshold((x, y, z)):
    (tx, ty, tz) = VARIANCE_THRESHOLD
    return x < tx and y < ty and z < tz

""" EVALUATE """
# Sort by variance in the token data
for i in range (1,100):
    i += 1
    l = i / 100.0
    COUNT_THRESHOLD = 10
    VARIANCE_THRESHOLD = getThreshold(l)

    # Collect the coordinates of all tokens that pass both thresholds
    coordinates_to_draw = []
    for tid, (medx, medy, medz, variances, count) in token_to_data.iteritems():
        if count > COUNT_THRESHOLD and checkVarianceThreshold(variances):
            coordinates_to_draw.append(EvaluationFunctions.convertCartesianToLatLong(medx, medy, medz))

    pickle.dump(coordinates_to_draw, open(sys.argv[1] + "_" + str(l) + ".pickle", 'wb'))
#
# # Draw coordinates to the map:
# for lon, lat in coordinates_to_draw:
#     basemap.plot(lon, lat, '.r', markeredgecolor='r', markersize=1,latlon=True)
#
# plt.savefig(sys.argv[1], format='png', bbox_inches='tight', dpi=900)