예제 #1
0
    def PreprocessingFile(files):
        """Count hashtag occurrences in one tweet file and save the counts.

        Every tweet yielded by ``TweetFiles.iterateTweetsFromFile(files)`` is
        read and each entry of its ``'h'`` list is tallied.  The resulting
        dict is written (its ``str()`` form followed by a newline) to
        ``./<name>dict`` in the current working directory, where ``<name>``
        is the text of ``files`` following the ``'reduced_tag_newyork/'``
        marker.

        Args:
            files: path of the tweet file; must contain
                ``'reduced_tag_newyork/'``.

        Returns:
            dict mapping each hashtag to the number of times it was seen.

        Raises:
            IndexError: if ``files`` does not contain the marker.
        """
        filename = files.split('reduced_tag_newyork/')[1]
        tagdict = {}
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            for tag in tweet['h']:
                # dict.get avoids scanning the list built by tagdict.keys().
                tagdict[tag] = tagdict.get(tag, 0) + 1
        # The context manager guarantees the counts are flushed and the
        # handle is released even if the write fails.
        with open('./' + filename + 'dict', 'w') as tagdictfile:
            tagdictfile.write(str(tagdict) + '\n')
        return tagdict
예제 #2
0
def BuildDataStructure(directory):
    """Build per-hashtag and per-location statistics from one tweet file.

    ``directory`` is passed straight to ``TweetFiles.iterateTweetsFromFile``.
    Only tweets with a non-empty ``'h'`` list and a ``'geo'`` or ``'bb'``
    entry are used; tweets without either location field are skipped (the
    original code reused the previous tweet's location, or failed with a
    NameError on the first tweet).

    Args:
        directory: path handed to ``TweetFiles.iterateTweetsFromFile``.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict)``:

        * ``CollectionOfHashtags[tag]`` has keys ``'totalCount'``,
          ``'uids'`` (list of distinct user ids), ``'fot'`` (``tweet['t']``
          of the first tweet carrying the tag) and ``'fol'`` (location
          string of that first tweet).
        * ``LocationDict[loc]`` has key ``'uids'`` (list of distinct user
          ids seen at ``loc``) plus one entry per tag holding
          ``'localcount'`` and ``'fot'``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            # Progress indicator.
            print(n)
        if not tweet['h']:
            continue

        # 'geo' takes precedence over 'bb', as in the original.
        if 'geo' in tweet:
            point = tweet['geo']
        elif 'bb' in tweet:
            point = tweet['bb']
        else:
            # No location information: nothing to attribute the tags to.
            continue
        loc = str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))
        uid = tweet['user']['id']

        if loc not in LocationDict:
            LocationDict[loc] = {'uids': []}
        loc_entry = LocationDict[loc]
        if uid not in loc_entry['uids']:
            loc_entry['uids'].append(uid)

        for tag in tweet['h']:
            if tag in CollectionOfHashtags:
                tag_entry = CollectionOfHashtags[tag]
                tag_entry['totalCount'] += 1
                if uid not in tag_entry['uids']:
                    tag_entry['uids'].append(uid)
            else:
                CollectionOfHashtags[tag] = {
                    'uids': [uid],
                    'totalCount': 1,
                    'fot': tweet['t'],
                    'fol': loc,
                }

            # NOTE(review): tags share a namespace with the 'uids' key of
            # loc_entry, so a hashtag literally named 'uids' would collide.
            if tag not in loc_entry:
                loc_entry[tag] = {'localcount': 1, 'fot': tweet['t']}
            else:
                loc_entry[tag]['localcount'] += 1
    return (CollectionOfHashtags, LocationDict)
예제 #3
0
def BuildDataStructure(directory):
    """Build hashtag and location statistics keyed by numbered locations.

    ``directory`` is passed straight to ``TweetFiles.iterateTweetsFromFile``.
    Only tweets with a non-empty ``'h'`` list and a ``'geo'`` or ``'bb'``
    entry are used; tweets without either location field are skipped.

    The location key is resolved once per tweet.  The original resolved it
    only when a previously unseen hashtag was encountered, so tweets whose
    tags were all known were counted against the location of an earlier
    tweet.

    Args:
        directory: path handed to ``TweetFiles.iterateTweetsFromFile``.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)``:

        * ``CollectionOfHashtags[tag]`` has keys ``'totalCount'``, ``'fot'``
          (``tweet['t']`` of the first tweet carrying the tag) and ``'fol'``
          (location key of that first tweet).
        * ``LocationDict[lockey][tag]`` has keys ``'localcount'`` and
          ``'fot'``.
        * ``LocationMap`` maps each integer location key (starting at 1) to
          the value returned by ``CheckLocation``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            # Progress indicator.
            print(n)
        tags = tweet["h"]
        if not tags:
            continue

        # 'geo' takes precedence over 'bb', as in the original.
        if "geo" in tweet:
            coords = tweet["geo"]
        elif "bb" in tweet:
            coords = tweet["bb"]
        else:
            # No location information: nothing to attribute the tags to.
            continue
        loc = CheckLocation(coords, 1, -124.77, -66.95, 22.52, 49.38)

        # Register the location (if new) before touching any counters so
        # that lockey always refers to the current tweet.
        if loc not in LocationMap.values():
            locationNum += 1
            LocationMap[locationNum] = loc
            LocationDict[locationNum] = {}
        lockey = GetKeyFromValue(LocationMap, loc)
        local_tags = LocationDict[lockey]

        for tag in tags:
            if tag in CollectionOfHashtags:
                CollectionOfHashtags[tag]["totalCount"] += 1
            else:
                CollectionOfHashtags[tag] = {
                    "totalCount": 1,
                    "fot": tweet["t"],
                    "fol": lockey,
                }

            if tag in local_tags:
                local_tags[tag]["localcount"] += 1
            else:
                local_tags[tag] = {"localcount": 1, "fot": tweet["t"]}
    return (CollectionOfHashtags, LocationDict, LocationMap)
예제 #4
0
def BuildDataStructure(directory):
    """Build hashtag and location statistics from every file in a directory.

    Each entry returned by ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromFile``.  Only tweets with a non-empty
    ``'h'`` list and a ``'geo'`` or ``'bb'`` entry are used; tweets without
    either location field are skipped.

    The location key is resolved once per tweet.  The original resolved it
    only when a previously unseen hashtag was encountered, so counts for
    already-known tags could be booked against an earlier tweet's location.

    Args:
        directory: directory containing the tweet files.  A trailing path
            separator is accepted but no longer required.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)``:

        * ``CollectionOfHashtags[tag]`` has keys ``'totalCount'``, ``'fot'``
          (``tweet['t']`` of the first tweet carrying the tag) and ``'fol'``
          (location key of that first tweet).
        * ``LocationDict[lockey][tag]`` has keys ``'localcount'`` and
          ``'fot'``.
        * ``LocationMap`` maps each integer location key (starting at 1) to
          the value returned by ``CheckLocation``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    for name in os.listdir(directory):
        print("current file reading is: " + name)
        # os.path.join yields the same path as ``directory + name`` when
        # directory ends with a separator, and a valid one when it does not.
        path = os.path.join(directory, name)
        for tweet in TweetFiles.iterateTweetsFromFile(path):
            tags = tweet['h']
            if not tags:
                continue

            # 'geo' takes precedence over 'bb', as in the original.
            if 'geo' in tweet:
                coords = tweet['geo']
            elif 'bb' in tweet:
                coords = tweet['bb']
            else:
                # No location information: nothing to attribute tags to.
                continue
            loc = CheckLocation(coords, 0.01, -125, -75, 30, 50)

            # Register the location (if new) before touching any counters
            # so that lockey always refers to the current tweet.
            if loc not in LocationMap.values():
                locationNum += 1
                LocationMap[locationNum] = loc
                LocationDict[locationNum] = {}
            lockey = GetKeyFromValue(LocationMap, loc)
            local_tags = LocationDict[lockey]

            for tag in tags:
                if tag in CollectionOfHashtags:
                    CollectionOfHashtags[tag]['totalCount'] += 1
                else:
                    CollectionOfHashtags[tag] = {
                        'totalCount': 1,
                        'fot': tweet['t'],
                        'fol': lockey,
                    }

                if tag in local_tags:
                    local_tags[tag]['localcount'] += 1
                else:
                    local_tags[tag] = {'localcount': 1, 'fot': tweet['t']}
    return (CollectionOfHashtags, LocationDict, LocationMap)
예제 #5
0
    def PreprocessingFile(files):
        """Copy the tweets accepted by ``BoundingNewYork`` to a new file.

        Each tweet yielded by ``TweetFiles.iterateTweetsFromFile(files)`` is
        tested with ``BoundingNewYork`` on its ``'geo'`` value, or its
        ``'bb'`` value when ``'geo'`` is absent, or an empty list when both
        are absent.  Tweets for which the check returns 1 are written as one
        JSON document per line to
        ``/home/wei/ideamap/reduced_tag_newyork/<name>2block``, where
        ``<name>`` is the text of ``files`` following the
        ``'reduced_tag_newyork/'`` marker.

        Args:
            files: path of the tweet file; must contain
                ``'reduced_tag_newyork/'``.

        Raises:
            IndexError: if ``files`` does not contain the marker.
        """
        filename = files.split('reduced_tag_newyork/')[1]
        newfilename = ('/home/wei/ideamap/reduced_tag_newyork/'
                       + filename + '2block')
        # The context manager flushes and closes the output even when
        # reading or serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    # Same fallback value the original handed to the check.
                    temploc = []
                if BoundingNewYork(temploc) == 1:
                    json.dump(tweet, newfile)
                    newfile.write("\n")
예제 #6
0
    def PreprocessingFile(files):
        """Copy hashtagged tweets accepted by ``BoundingUS`` to a new file.

        Each tweet yielded by ``TweetFiles.iterateTweetsFromFile(files)``
        that has a non-empty ``'h'`` list is tested with ``BoundingUS`` on
        its ``'geo'`` value, or its ``'bb'`` value when ``'geo'`` is absent,
        or an empty list when both are absent.  Tweets for which the check
        returns 1 are written as one JSON document per line to
        ``/home/wei/geo_hashtag_data/<name>.txt1``, where ``<name>`` is the
        text of ``files`` following the ``'reduced_geo/'`` marker.

        Args:
            files: path of the tweet file; must contain ``'reduced_geo/'``.

        Raises:
            IndexError: if ``files`` does not contain the marker.
        """
        filename = files.split('reduced_geo/')[1]
        newfilename = '/home/wei/geo_hashtag_data/' + filename + '.txt1'
        # The context manager flushes and closes the output even when
        # reading or serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if not tweet['h']:
                    continue
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    # Same fallback value the original handed to the check.
                    temploc = []
                if BoundingUS(temploc) == 1:
                    json.dump(tweet, newfile)
                    newfile.write("\n")
예제 #7
0
    def PreprocessingFile(files, tagdict):
        """Filter hashtagged tweets by a coordinate box and count their tags.

        Each tweet yielded by ``TweetFiles.iterateTweetsFromFile(files)``
        that has a non-empty ``'h'`` list is located through its ``'geo'``
        value, or its ``'bb'`` value when ``'geo'`` is absent.  When the
        first coordinate lies strictly between 24.52 and 49.38 and the
        second strictly between -124.77 and -66.95, the tweet is written as
        one JSON line to ``/home/wei/new_geo_hashtag_data/<name>.txt`` and
        its hashtags are added to ``tagdict``.  ``<name>`` is the text of
        ``files`` following the ``'reduced_geo/'`` marker.

        Tweets with no usable location are skipped; the original raised an
        IndexError on them by indexing an empty list.

        Args:
            files: path of the tweet file; must contain ``'reduced_geo/'``.
            tagdict: dict of hashtag counts, updated in place.

        Raises:
            IndexError: if ``files`` does not contain the marker.
        """
        filename = files.split('reduced_geo/')[1]
        newfilename = '/home/wei/new_geo_hashtag_data/' + filename + '.txt'
        # The context manager flushes and closes the output even when
        # reading or serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if not tweet['h']:
                    continue
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    continue
                if len(temploc) < 2:
                    # Cannot be tested against the box.
                    continue
                # Presumably (latitude, longitude) of the continental US --
                # TODO confirm against the data format.
                inside = (24.52 < temploc[0] < 49.38
                          and -124.77 < temploc[1] < -66.95)
                if not inside:
                    continue
                for tag in tweet['h']:
                    # dict.get avoids scanning the tagdict.keys() list.
                    tagdict[tag] = tagdict.get(tag, 0) + 1
                json.dump(tweet, newfile)
                newfile.write("\n")
예제 #8
0
from twitter import TweetFiles
import os
import json
# Script: keep only the tweets that carry at least one hashtag whose total
# count (read from a previously saved dict file) is >= 100000.
dir = "/home/wei/new_geo_hashtag_data/2011_3.txt"
dictpath = "/home/wei/ideamap/2011_3dict"
path = "/home/wei/new_geo_hashtag_data/reduced100000_2011_3"

# The first line of the dict file is expected to hold a dict literal.
# NOTE(review): eval() executes arbitrary code from the file; if the file
# only ever contains a plain dict literal, ast.literal_eval would be the
# safer choice.  Left unchanged here because the file's origin is not
# visible from this script.
with open(dictpath, 'r') as f:
    tagdict = eval(f.readline())

# Hashtags frequent enough to keep, with their counts.
tagdict1 = {}
for tag in tagdict:
    if tagdict[tag] >= 100000:
        tagdict1[tag] = tagdict[tag]
        print("%s %s" % (tag, tagdict[tag]))

# The context manager ensures the filtered output is flushed and closed.
with open(path, 'w') as f1:
    for tweet in TweetFiles.iterateTweetsFromFile(dir):
        for tag in tweet['h']:
            if tag in tagdict1:
                # Write each matching tweet once, however many frequent
                # tags it carries.
                json.dump(tweet, f1)
                f1.write('\n')
                break