def iterateFrequentLocationsFromFIMahout(
     minLocationsTheUserHasCheckedin,
     minUniqueUsersCheckedInTheLocation,
     minCalculatedSupport,
     minLocationsInItemset=0,
     extraMinSupport=minSupport,
     yieldSupport=False,
     lids=False,
 ):
     """Iterate frequent location itemsets parsed from a Mahout FPM output file.

     Reads the file named by the module-level template
     locationsFIMahoutOutputFile (filled with the three min* parameters) and
     parses 'Key: ... Value: ([lid1 lid2 ...],support)' lines.

     Parameters:
         minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation,
         minCalculatedSupport -- substituted into the input-file name template.
         minLocationsInItemset -- itemsets with fewer locations are skipped.
         extraMinSupport -- itemsets with lower support are skipped.
             NOTE(review): the default is the module-level global `minSupport`,
             captured once at definition time.
         yieldSupport -- when True, yield (locations, support) instead of just
             the location list.
         lids -- when True, keep raw location-id strings instead of converting
             them with getLocationFromLid().

     Yields:
         A list of locations filtered to us_boundary, or (with
         yieldSupport=True) a (locations, support) tuple.
     """
     #        for line in FileIO.iterateLinesFromFile(locationsFIMahoutOutputFile%(minUserLocations, minCalculatedSupport)):
     for line in FileIO.iterateLinesFromFile(
         locationsFIMahoutOutputFile
         % (minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, minCalculatedSupport)
     ):
         if line.startswith("Key:"):
             # Value payload looks like "([lid1 lid2],support)":
             # data[0] is "[lid1 lid2]", data[1] is the support count.
             data = line.split("Value: ")[1][1:-1].split(",")
             if not lids:
                 locationItemset, support = (
                     [getLocationFromLid(i.replace("_", " ")) for i in data[0][1:-1].split()],
                     int(data[1]),
                 )
             else:
                 locationItemset, support = [i.replace("_", " ") for i in data[0][1:-1].split()], int(data[1])
             if support >= extraMinSupport and len(locationItemset) >= minLocationsInItemset:
                 if not yieldSupport:
                     # NOTE(review): this branch passes itemset entries to
                     # isWithinBoundingBox directly, while the yieldSupport
                     # branch below first applies getLocationFromLid -- the two
                     # branches therefore assume different values of `lids`;
                     # confirm intended usage before relying on both paths.
                     yield [location for location in locationItemset if isWithinBoundingBox(location, us_boundary)]
                 else:
                     yield [
                         location
                         for location in locationItemset
                         if isWithinBoundingBox(getLocationFromLid(location), us_boundary)
                     ], support
Пример #2
0
 def load_tuo_location_and_boundary_influence_score(model_id, hashtag_tag, boundary=None, noOfInfluencers=None):
     """Return a list of (location, signed_global_influence_score) pairs for
     locations inside *boundary*.

     For every location both the mean incoming and mean outgoing influence
     scores are computed (sum of per-neighbor scores divided by the number of
     locations surviving the boundary filter); the dominant of the two becomes
     the location's global score, negated for incoming influence so that
     predominantly-influenced locations get negative scores and influencing
     ones positive scores.

     Parameters:
         model_id, hashtag_tag, noOfInfluencers -- forwarded to
             Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score().
         boundary -- [[south, west], [north, east]] bounding box; defaults to
             the whole world.
     """
     # Bug fix: the signature previously used a mutable default argument
     # (boundary=[[-90,-180], [90, 180]]) -- a single list object shared by
     # every call, the classic Python pitfall.  A None sentinel keeps the same
     # default behavior without the shared object.
     if boundary is None: boundary = [[-90, -180], [90, 180]]
     mf_location_to_global_influence_score = {}
     mf_location_to_mf_influence_type_to_influence_score = defaultdict(dict)
     mf_location_to_tuo_neighbor_location_and_locations_influencing_score = \
         dict(Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers, InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE))
     mf_location_to_tuo_neighbor_location_and_locations_influenced_score = \
         dict(Experiments.load_tuo_location_and_tuo_neighbor_location_and_locations_influence_score(model_id, hashtag_tag, noOfInfluencers, InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE))
     # Drop locations outside the boundary.  Python 2: keys()[:] copies the
     # key list so entries can be deleted while iterating.
     for location in mf_location_to_tuo_neighbor_location_and_locations_influenced_score.keys()[:]:
         if not isWithinBoundingBox(getLocationFromLid(location.replace('_', ' ')), boundary):
             if location in mf_location_to_tuo_neighbor_location_and_locations_influencing_score:
                 del mf_location_to_tuo_neighbor_location_and_locations_influencing_score[location]
             del mf_location_to_tuo_neighbor_location_and_locations_influenced_score[location]
     no_of_locations = len(mf_location_to_tuo_neighbor_location_and_locations_influenced_score)
     # Mean incoming influence per location (Python 2 zip returns a list, so
     # zip(*tuos)[1] extracts the score column).
     for location, tuo_neighbor_location_and_locations_influencing_score in \
             mf_location_to_tuo_neighbor_location_and_locations_influencing_score.iteritems():
         mf_location_to_mf_influence_type_to_influence_score[location][InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE] \
             = sum(zip(*tuo_neighbor_location_and_locations_influencing_score)[1])/no_of_locations
     # Mean outgoing influence per location.
     for location, tuo_neighbor_location_and_locations_influenced_score in \
             mf_location_to_tuo_neighbor_location_and_locations_influenced_score.iteritems():
         mf_location_to_mf_influence_type_to_influence_score[location][InfluenceMeasuringModels.TYPE_OUTGOING_INFLUENCE] \
             = sum(zip(*tuo_neighbor_location_and_locations_influenced_score)[1])/no_of_locations
     # The dominant influence type decides the sign of the global score.
     for location, mf_influence_type_to_influence_score in \
             mf_location_to_mf_influence_type_to_influence_score.iteritems():
         influence_type, influence_score = max(mf_influence_type_to_influence_score.iteritems(), key=itemgetter(1))
         if influence_type==InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE: 
             mf_location_to_global_influence_score[location] = -influence_score
         else: mf_location_to_global_influence_score[location] = influence_score
     return mf_location_to_global_influence_score.items()
 def mapper(self, key, line):
     """Emit (checkin, 1) for each parseable checkin line inside `boundary`.

     The parsed record is normalized in place: the Mongo '_id' field is
     dropped, 't' is converted from a datetime to an epoch float, and
     'lid'/'llid' (exact and 0.015-degree lattice location ids) are added.
     """
     checkin = parseData(line)
     # Skip unparseable lines and checkins outside the bounding box.
     if not checkin or not isWithinBoundingBox(checkin['l'], boundary):
         return
     del checkin['_id']
     checkin['t'] = time.mktime(checkin['t'].timetuple())
     checkin['lid'] = getLidFromLocation(checkin['l'])
     checkin['llid'] = getLatticeLid(checkin['l'], accuracy=0.015)
     yield checkin, 1
Пример #4
0
def iterateHashtagObjectInstances(line):
    """Yield (hashtag, [lattice_id, epoch]) for every hashtag of a tweet line.

    The tweet's point comes from 'geo' when present, otherwise 'bb'; its
    timestamp is rounded to TIME_UNIT_IN_SECONDS.  Tweets outside BOUNDARY and
    the degenerate '0.0000_0.0000' lattice are dropped.
    """
    tweet = cjson.decode(line)
    point = tweet['geo'] if 'geo' in tweet else tweet['bb']
    parsed_time = getDateTimeObjectFromTweetTimestamp(tweet['t'])
    epoch = GeneralMethods.approximateEpoch(time.mktime(parsed_time.timetuple()), TIME_UNIT_IN_SECONDS)
    if not isWithinBoundingBox(point, BOUNDARY):
        return
    lattice_id = getLatticeLid(point, LATTICE_ACCURACY)
    if lattice_id == '0.0000_0.0000':
        return
    for hashtag in tweet['h']:
        yield hashtag.lower(), [lattice_id, epoch]
Пример #5
0
def getHashtagWithoutEndingWindow(key, values, specificToArea=False):
    """Aggregate a hashtag's occurrences and return its object if it qualifies.

    values is an iterable of partial hashtag objects, each carrying an 'oc'
    list of (point, time) occurrences.  With specificToArea=True only
    occurrences inside AREA_DETAILS[0] are kept and AREA_DETAILS[1] replaces
    the global minimum-occurrence threshold.

    Returns {'h', 't', 'e', 'l', 'oc'} (key, total occurrences, earliest,
    latest, time-sorted occurrences) when the hashtag has enough occurrences
    and its earliest one falls within HASHTAG_STARTING_WINDOW; otherwise
    returns None implicitly.
    """
    occurences = []
    for instances in values: 
        if not specificToArea: occurences+=instances['oc']
        else:
            for oc in instances['oc']:
                if isWithinBoundingBox(oc[0], AREA_DETAILS[0]): occurences.append(oc)
    if occurences:
        # Bug fix: the original assigned MIN_HASHTAG_OCCURENCES inside the
        # specificToArea branch, which made the name local to the whole
        # function and raised UnboundLocalError whenever specificToArea was
        # False.  Choose the threshold explicitly instead; the module-level
        # MIN_HASHTAG_OCCURENCES is the non-area default the original
        # comparison intended to read.
        minOccurences = AREA_DETAILS[1] if specificToArea else MIN_HASHTAG_OCCURENCES
        e, l = min(occurences, key=lambda t: t[1]), max(occurences, key=lambda t: t[1])
        numberOfInstances=len(occurences)
        if numberOfInstances>=minOccurences and \
            e[1]>=HASHTAG_STARTING_WINDOW: return {'h': key, 't': numberOfInstances, 'e':e, 'l':l, 'oc': sorted(occurences, key=lambda t: t[1])}
Пример #6
0
 def analyzeDataClusters():
     """Print the top checkin-sequence clusters for venues matching the
     'cafe' regex that fall inside the US bounding box.

     Reads the JSON analysis file produced for the N_LOCATIONS neighbor
     selection method, keeps only records with checkinsWindow == 10, and for
     each US location prints its venue name (from Mongo) followed by the
     member lids of its first five clusters.
     """
     regex = 'cafe'
     neighborLocationExtractionMethod = NeighborLocationsSelection.N_LOCATIONS
     inputFile = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/'+regex
     for line in FileIO.iterateJsonFromFile(inputFile):
         if line['parameters']['checkinsWindow']==10:
             for location, data in line['locations'].iteritems():
 #                data = line['locations']['41.895 -87.623']
                 if isWithinBoundingBox(getLocationFromLid(location), us_boundary):
                     # Venue display name from Mongo, then the first five
                     # clusters' member lids.
                     print venuesCollection.find_one({'lid': location})['n'], location,'\n'
                     for l, _ in data['clusters'][:5]:
                         print [i[0] for i in l]
                     print '\n ********** \n'
Пример #7
0
    def influence_clusters(model_ids, min_cluster_size=15):
        """For each model, cluster locations by incoming-influence similarity
        (affinity propagation), plot the 10 largest clusters on a world map in
        random per-cluster colors, and draw great-circle edges between
        same-cluster locations.

        NOTE(review): min_cluster_size is currently unused -- the size filter
        is commented out below and the 10 largest clusters are taken instead.
        """
        influence_type = InfluenceMeasuringModels.TYPE_INCOMING_INFLUENCE
        for model_id in model_ids:
            digraph_of_location_and_location_similarity = nx.DiGraph()
            # Build the similarity digraph, restricted to pairs where both
            # endpoints lie inside PARTIAL_WORLD_BOUNDARY.
            for line_count, (location, tuo_neighbor_location_and_mf_influence_type_and_similarity) in \
                        enumerate(FileIO.iterateJsonFromFile(tuo_location_and_tuo_neighbor_location_and_mf_influence_type_and_similarity_file%model_id)):
#                print line_count
                for neighbor_location, mf_influence_type_to_similarity in tuo_neighbor_location_and_mf_influence_type_and_similarity: 
                    if isWithinBoundingBox(getLocationFromLid(location.replace('_', ' ')), PARTIAL_WORLD_BOUNDARY) and \
                            isWithinBoundingBox(getLocationFromLid(neighbor_location.replace('_', ' ')), PARTIAL_WORLD_BOUNDARY):
                        # Old networkx API: edge attributes passed as a dict
                        # positional argument.
                        digraph_of_location_and_location_similarity.add_edge(location, neighbor_location, {'w': mf_influence_type_to_similarity[influence_type]})

            no_of_clusters, tuo_location_and_cluster_id = clusterUsingAffinityPropagation(digraph_of_location_and_location_similarity)
            # Group locations by cluster id (sorted by the same key groupby uses).
            tuo_cluster_id_to_locations = [ (cluster_id, zip(*ito_tuo_location_and_cluster_id)[0])
                                            for cluster_id, ito_tuo_location_and_cluster_id in 
                                            groupby(
                                                  sorted(tuo_location_and_cluster_id, key=itemgetter(1)),
                                                  key=itemgetter(1)
                                                  )
                                           ]
            mf_location_to_cluster_id = dict(tuo_location_and_cluster_id)
            mf_cluster_id_to_cluster_color = dict([(i, GeneralMethods.getRandomColor()) for i in range(no_of_clusters)])
            mf_valid_locations_to_color = {}
            # Color only the 10 largest clusters (Python 2 tuple-parameter lambda).
            for cluster_id, locations in \
                    sorted(tuo_cluster_id_to_locations, key=lambda (cluster_id, locations): len(locations))[-10:]:
#                if len(locations)>min_cluster_size:
                print cluster_id, len(locations)
                for location in locations: mf_valid_locations_to_color[location] \
                    = mf_cluster_id_to_cluster_color[mf_location_to_cluster_id[location]]
            locations, colors = zip(*mf_valid_locations_to_color.iteritems())
            locations = [getLocationFromLid(location.replace('_', ' ')) for location in locations]
            _, m = plotPointsOnWorldMap(locations, blueMarble=False, bkcolor='#CFCFCF', c=colors, s=0, returnBaseMapObject=True, lw = 0)
            # Draw intra-cluster edges in the cluster's color on the basemap.
            for u, v, data in digraph_of_location_and_location_similarity.edges(data=True):
                if u in mf_valid_locations_to_color and v in mf_valid_locations_to_color \
                        and mf_location_to_cluster_id[u]==mf_location_to_cluster_id[v]:
                    color, u, v, w = mf_cluster_id_to_cluster_color[mf_location_to_cluster_id[u]], getLocationFromLid(u.replace('_', ' ')), getLocationFromLid(v.replace('_', ' ')), data['w']
                    m.drawgreatcircle(u[1], u[0], v[1], v[0], color=color, alpha=0.6)
            plt.show()
Пример #8
0
 def writeLocationToUserMap(place):
     """Write, as JSON lines, the per-location user/checkin map for every
     location inside place['boundary'], enriched with the venue's name,
     categories and tags from Mongo.

     place: dict with 'name', 'boundary' and optionally 'minLocationCheckins'
     (locations with at most that many checkins are skipped; default 0).
     """
     name, boundary = place['name'], place['boundary']
     # Start from a clean output file for this place.  Note the chained %%:
     # 'rm -rf %s' % template yields 'rm -rf <template-with-%s>', then % name
     # fills in the place name.
     GeneralMethods.runCommand('rm -rf %s'%placesLocationToUserMapFile%name)
     for location in filteredLocationToUserAndTimeMapIterator(minLocationsTheUserHasCheckedin, minUniqueUsersCheckedInTheLocation, inputFile=locationToUserAndExactTimeMapFile):
         lid=getLocationFromLid(location['location'])
         if isWithinBoundingBox(lid, boundary): 
             location['categories'] = ''; location['tags'] = ''; location['name']=''
             # Venue display name, when the venue is known.
             title = venuesCollection.find_one({'lid':location['location']})
             if title: location['name'] = unicode(title['n']).encode("utf-8")
             meta = venuesMetaDataCollection.find_one({'_id':location['location']})
             if meta: location['categories'] = unicode(meta['c']).encode("utf-8"); location['tags'] = unicode(meta['t']).encode("utf-8")
             # Normalize user ids to str keys.  Python 2: keys()[:] copies the
             # key list so the dict can be mutated while iterating.
             # NOTE(review): if a key is already a str, str(user) is the same
             # key, so the set-then-delete pair removes the entry -- presumably
             # user ids are ints here; confirm.
             for user in location['users'].keys()[:]: location['users'][str(user)]=location['users'][user]; del location['users'][user]
             location['noOfCheckins']=sum([len(epochs) for user, userVector in location['users'].iteritems() for day, dayVector in userVector.iteritems() for db, epochs in dayVector.iteritems()])
             if location['noOfCheckins']>place.get('minLocationCheckins',0): FileIO.writeToFileAsJson(location, placesLocationToUserMapFile%name)
Пример #9
0
 def writeGraphs(regex, neighborLocationExtractionMethod, **kwargs):
     """For every checkin sequence matching *regex*, build its neighboring-
     locations graph, prune self-loops and all but the heaviest fraction of
     edges, drop isolated nodes, and plot the result with venue names as
     labels.

     kwargs must include 'checkinsWindow' and 'percentageOfTopEdgesByWeight'
     (plus whatever the selected NeighborLocationsSelection method expects).
     """
     def getLocationName(lid):
         # Venue display name from Mongo; fall back to the raw lid.
         object = venuesCollection.find_one({'lid': lid})
         if object: return object['n']
         else: return lid
     inputFileName = checkinSequenceLocationRegexFolder+regex
     outputFileName = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/'+regex
     for data in FileIO.iterateJsonFromFile(inputFileName):
         if isWithinBoundingBox(getLocationFromLid(data['lid']), world_boundary):
             # NOTE(review): outputFileName is recomputed per lid, shadowing
             # the value assigned above (which is never used afterwards).
             outputFileName = checkinSequenceLocationRegexAnalysisFolder+neighborLocationExtractionMethod+'/graph/'+regex+'/'+data['lid']+'_%s'%kwargs['checkinsWindow']
             print 'Analyzing:', kwargs['checkinsWindow'], data['lid'], outputFileName
             graph = NeigboringLocationsGraph.getLocationGraph(data,  NeighborLocationsSelection.getMethod(neighborLocationExtractionMethod), **kwargs)

             # Drop self-loops; record remaining edge weights for pruning.
             # (Old networkx API: edges() returns a list, graph.edge holds the
             # attribute dicts.)
             labels, edgeWeights = {}, []
             for u, v in graph.edges()[:]:
                 if u==v: graph.remove_edge(u, v)
                 else: edgeWeights.append((graph.edge[u][v]['w'], (u,v)))
             # Keep only the top percentageOfTopEdgesByWeight fraction of
             # edges by weight; remove the rest.
             edgesToRemove = [i[1] for i in sorted(edgeWeights, key=itemgetter(0), reverse=True)[int(len(edgeWeights)*kwargs['percentageOfTopEdgesByWeight']):]]
             for u, v in edgesToRemove: graph.remove_edge(u, v)

             # Remove now-isolated nodes; label the rest with venue names.
             for u in graph.nodes(): 
                 if graph.degree(u)==0: graph.remove_node(u)
                 else: labels[u] = unicode(getLocationName(u)).encode('utf-8')
             plot(graph, node_color='#A0CBE2',width=4,edge_cmap=plt.cm.Blues,with_labels=True,labels=labels)
Пример #10
0
import os, gzip, cjson
from library.twitter import TweetFiles
from library.file_io import FileIO
from library.geo import isWithinBoundingBox
from settings import us_boundary

# %s-template for the per-month checkins output file.
checkinsFile = 'checkins/%s'

def tweetFilesIterator():
    """Yield (outputFile, tweetFilePath) for every 2011 geo-tweet file
    (months Feb-Nov, days 1-31); months whose day folder is absent are
    silently skipped."""
    dayFolderTemplate = '/mnt/chevron/bde/Data/TweetData/GeoTweets/2011/%s/%s/'
    for month in range(2, 12):
        monthOutputFile = checkinsFile%month
        for day in range(1, 32):
            dayFolder = dayFolderTemplate%(month, day)
            if not os.path.exists(dayFolder):
                continue
            # NOTE(review): os.walk recurses, but names are joined directly
            # onto dayFolder -- a file in a nested subdirectory would get a
            # wrong path; presumably the day folders are flat. Confirm.
            for _, _, fileNames in os.walk(dayFolder):
                for fileName in fileNames:
                    yield monthOutputFile, dayFolder + fileName

# Top-level driver: stream every gzipped geo-tweet file and append the tweets
# that fall inside the US bounding box to that month's checkins file as JSON.
for outputFile, file in tweetFilesIterator():
    print 'Parsing: %s'%file
    for line in gzip.open(file, 'rb'):
        try:
            data = cjson.decode(line)
            # Keep only tweets that actually carry point coordinates.
            if 'geo' in data and data['geo']!=None:
                if isWithinBoundingBox(data['geo']['coordinates'], us_boundary):
                    # Reduced checkin record: point, user (id + free-text
                    # location), tweet id, created time, hashtags, raw text.
                    checkin = {'geo': data['geo']['coordinates'], 'user': {'id': data['user']['id'], 'l': data['user']['location']}, 'id': data['id'], 't': data['created_at'], 'h': [], 'tx': data['text']}
                    for h in data['entities']['hashtags']: checkin['h'].append(h['text'])
#                    print checkin
                    FileIO.writeToFileAsJson(checkin, outputFile)
        # Best effort: malformed lines/tweets are reported and skipped.
        except Exception as e: print e
Пример #11
0
def locationsForUsIterator(minUniqueUsersCheckedInTheLocation): 
    """Return a generator over US location ids whose unique-visitor count
    meets the given threshold, read from locationByUserDistributionFile."""
    def _qualifies(record):
        # Popular enough AND geographically inside the US bounding box.
        return (record['count'] >= minUniqueUsersCheckedInTheLocation
                and isWithinBoundingBox(getLocationFromLid(record['location']), us_boundary))
    return (record['location']
            for record in FileIO.iterateJsonFromFile(locationByUserDistributionFile)
            if _qualifies(record))
Пример #12
0
def latticeIdInValidAreas(latticeId):
    """Return True when the lattice id ('lat_lng' string) lies inside any
    bounding box in BOUNDARIES; otherwise fall through returning None
    (falsy), matching the original implicit-None contract."""
    point = getLocationFromLid(latticeId.replace('_', ' '))
    if any(isWithinBoundingBox(point, area) for area in BOUNDARIES):
        return True
Пример #13
0
 def filter_latticeObjectsByBoundaryOny(self, key, values):
     """Pass through (key, latticeObject) pairs whose lattice point falls
     inside BOUNDARY.

     Only the first value is considered; 'llid' is an underscore-separated
     'lat_lng' string converted to a point before the bounding-box test.
     """
     lattice_object = list(values)[0]
     lid_string = lattice_object['llid'].replace('_', ' ')
     if isWithinBoundingBox(getLocationFromLid(lid_string), BOUNDARY):
         yield key, lattice_object
Пример #14
0
 def filter_latticeObjects(self, key, values):
     """Emit (key, latticeObject) only for lattices that have enough checkins
     and whose 'llid' point lies inside BOUNDARY.

     Only the first value is considered; len(latticeObject['c']) is the
     lattice's checkin count.
     """
     lattice_object = list(values)[0]
     checkin_count = len(lattice_object['c'])
     # Cheap count test first, exactly as the original short-circuit did.
     if checkin_count < MINIMUM_NO_OF_CHECKINS_PER_LOCATION:
         return
     lattice_point = getLocationFromLid(lattice_object['llid'].replace('_', ' '))
     if isWithinBoundingBox(lattice_point, BOUNDARY):
         yield key, lattice_object
 def get_valid_location((location, mf_model_id_to_hashtags)):
     """Return True when the pair's location lies inside US_BOUNDARY.

     Python-2-only tuple-parameter unpacking: the single argument is a
     (location, mf_model_id_to_hashtags) pair; the hashtag map is ignored.
     The location is an underscore-separated 'lat_lng' string.
     """
     location = getLocationFromLid(location.replace('_', ' '))
     return isWithinBoundingBox(location, US_BOUNDARY)