def PreprocessingFile(files):
    """Count hashtag occurrences in one tweet file and save the counts.

    Every entry of each tweet's ``'h'`` field is tallied.  The resulting
    dict is written, as its ``str()`` form followed by a newline, to
    ``./<name>dict`` in the current working directory, where ``<name>`` is
    the part of ``files`` that follows ``'reduced_tag_newyork/'``.

    Args:
        files: Path of the tweet file to read.  It must contain the
            substring ``'reduced_tag_newyork/'``; otherwise the split below
            raises IndexError.

    Returns:
        dict mapping each tag to the number of times it was seen.
    """
    filename = files.split('reduced_tag_newyork/')[1]

    tagdict = {}
    for tweet in TweetFiles.iterateTweetsFromFile(files):
        for tag in tweet['h']:
            # dict.get avoids the list scan that `tag in tagdict.keys()`
            # performs under Python 2.
            tagdict[tag] = tagdict.get(tag, 0) + 1

    # Same bytes as the former `print >>tagdictfile, tagdict`, but the handle
    # is closed (and therefore flushed) deterministically.
    with open('./' + filename + 'dict', 'w') as tagdictfile:
        tagdictfile.write(str(tagdict) + '\n')
    return tagdict
def BuildDataStructure(directory):
    """Aggregate hashtag statistics, globally and per location, from a file.

    Only tweets with a non-empty ``'h'`` field are used.  A tweet's location
    is ``str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))``
    where ``point`` is ``tweet['geo']`` if present, else ``tweet['bb']``.
    Tweets that carry neither key are skipped: previously such a tweet
    either raised NameError (if it was the first one) or was silently
    counted at the previous tweet's location.

    Args:
        directory: Path handed to ``TweetFiles.iterateTweetsFromFile``
            (despite the name, this revision reads a single path).

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict)``:

        * ``CollectionOfHashtags[tag]`` is a dict with ``'totalCount'``,
          ``'uids'`` (list of distinct user ids), ``'fot'`` (``tweet['t']``
          of the first tweet containing the tag) and ``'fol'`` (a location
          string).
        * ``LocationDict[loc]`` is a dict with ``'uids'`` (list of distinct
          user ids seen at ``loc``) plus one entry per tag holding
          ``'localcount'`` and ``'fot'``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}

    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            # Progress indicator; parenthesised so the statement is valid
            # under both Python 2 and Python 3.
            print(n)

        if not tweet['h']:
            continue

        if 'geo' in tweet:
            point = tweet['geo']
        elif 'bb' in tweet:
            point = tweet['bb']
        else:
            # No coordinates at all: nothing to attribute this tweet to.
            continue
        loc = str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))

        uid = tweet['user']['id']
        if loc not in LocationDict:
            LocationDict[loc] = {'uids': []}
        # NOTE(review): tag entries share this dict with the 'uids' key, so a
        # hashtag literally named 'uids' would collide with it.  Left as is
        # because fixing it changes the returned structure.
        local = LocationDict[loc]
        if uid not in local['uids']:
            local['uids'].append(uid)

        for tag in tweet['h']:
            entry = CollectionOfHashtags.get(tag)
            if entry is None:
                entry = {'uids': [uid], 'totalCount': 1, 'fot': tweet['t']}
                CollectionOfHashtags[tag] = entry
            else:
                entry['totalCount'] += 1
                if uid not in entry['uids']:
                    entry['uids'].append(uid)
            # NOTE(review): the original indentation was lost; this
            # assignment is reproduced at loop level, so 'fol' ends up as the
            # location of the most recent tweet with the tag.  If it is meant
            # to be the first-occurrence location, move it into the
            # `entry is None` branch above.
            entry['fol'] = loc

            if tag not in local:
                local[tag] = {'localcount': 1, 'fot': tweet['t']}
            else:
                local[tag]['localcount'] += 1

    return (CollectionOfHashtags, LocationDict)
def BuildDataStructure(directory):
    """Aggregate hashtag statistics per numbered location from a tweet file.

    Only tweets with a non-empty ``"h"`` field are used.  A tweet's location
    is ``CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)`` where
    ``point`` is ``tweet["geo"]`` if present, else ``tweet["bb"]``.  Each
    distinct location value is assigned the next integer id (starting at 1).
    Tweets that carry neither key are skipped: previously such a tweet
    either raised NameError (if it was the first one) or was silently
    counted at the previous tweet's location.

    Args:
        directory: Path handed to ``TweetFiles.iterateTweetsFromFile``
            (despite the name, this revision reads a single path).

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)``:

        * ``CollectionOfHashtags[tag]`` has ``"totalCount"``, ``"fot"``
          (``tweet["t"]`` of the first tweet containing the tag) and
          ``"fol"`` (a location id).
        * ``LocationDict[location_id][tag]`` has ``"localcount"`` and
          ``"fot"``.
        * ``LocationMap[location_id]`` is the value ``CheckLocation``
          returned for that location.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far

    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            # Progress indicator; parenthesised so the statement is valid
            # under both Python 2 and Python 3.
            print(n)

        if not tweet["h"]:
            continue

        if "geo" in tweet:
            point = tweet["geo"]
        elif "bb" in tweet:
            point = tweet["bb"]
        else:
            # No coordinates at all: nothing to attribute this tweet to.
            continue
        loc = CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)

        # The location is the same for every tag of the tweet, so it is
        # registered and looked up once here instead of once per tag.
        # The linear scan over values() is kept because `loc` may not be
        # hashable (presumably a coordinate pair -- TODO confirm).
        if loc not in LocationMap.values():
            locationNum += 1
            LocationMap[locationNum] = loc
            LocationDict[locationNum] = {}
        # Assumes GetKeyFromValue is a side-effect-free reverse lookup.
        lockey = GetKeyFromValue(LocationMap, loc)
        local = LocationDict[lockey]

        for tag in tweet["h"]:
            entry = CollectionOfHashtags.get(tag)
            if entry is None:
                entry = {"totalCount": 1, "fot": tweet["t"]}
                CollectionOfHashtags[tag] = entry
            else:
                entry["totalCount"] += 1
            # NOTE(review): the original indentation was lost; this
            # assignment is reproduced at loop level, so "fol" ends up as the
            # location of the most recent tweet with the tag.  If it is meant
            # to be the first-occurrence location, move it into the
            # `entry is None` branch above.
            entry["fol"] = lockey

            if tag not in local:
                local[tag] = {"localcount": 1, "fot": tweet["t"]}
            else:
                local[tag]["localcount"] += 1

    return (CollectionOfHashtags, LocationDict, LocationMap)
def BuildDataStructure(directory):
    """Aggregate hashtag statistics per numbered location from a directory.

    Every entry returned by ``os.listdir(directory)`` is read as a tweet
    file.  Only tweets with a non-empty ``'h'`` field are used.  A tweet's
    location is ``CheckLocation(point, 0.01, -125, -75, 30, 50)`` where
    ``point`` is ``tweet['geo']`` if present, else ``tweet['bb']``.  Each
    distinct location value is assigned the next integer id (starting at 1).
    Tweets that carry neither key are skipped: previously such a tweet
    either raised NameError (if it was the first one) or was silently
    counted at the previous tweet's location.

    Args:
        directory: Directory containing the tweet files.  A trailing path
            separator is no longer required.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)``:

        * ``CollectionOfHashtags[tag]`` has ``'totalCount'``, ``'fot'``
          (``tweet['t']`` of the first tweet containing the tag) and
          ``'fol'`` (a location id).
        * ``LocationDict[location_id][tag]`` has ``'localcount'`` and
          ``'fot'``.
        * ``LocationMap[location_id]`` is the value ``CheckLocation``
          returned for that location.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far

    for name in os.listdir(directory):
        # Parenthesised so the statement is valid under Python 2 and 3.
        print("current file reading is: " + name)
        # os.path.join gives the same path as `directory + name` when
        # `directory` ends with a separator, and a correct one when it
        # does not.
        path = os.path.join(directory, name)

        for tweet in TweetFiles.iterateTweetsFromFile(path):
            if not tweet['h']:
                continue

            if 'geo' in tweet:
                point = tweet['geo']
            elif 'bb' in tweet:
                point = tweet['bb']
            else:
                # No coordinates at all: nothing to attribute this tweet to.
                continue
            loc = CheckLocation(point, 0.01, -125, -75, 30, 50)

            # The location is the same for every tag of the tweet, so it is
            # registered and looked up once here instead of once per tag.
            # The linear scan over values() is kept because `loc` may not be
            # hashable (presumably a coordinate pair -- TODO confirm).
            if loc not in LocationMap.values():
                locationNum += 1
                LocationMap[locationNum] = loc
                LocationDict[locationNum] = {}
            # Assumes GetKeyFromValue is a side-effect-free reverse lookup.
            lockey = GetKeyFromValue(LocationMap, loc)
            local = LocationDict[lockey]

            for tag in tweet['h']:
                entry = CollectionOfHashtags.get(tag)
                if entry is None:
                    entry = {'totalCount': 1, 'fot': tweet['t']}
                    CollectionOfHashtags[tag] = entry
                else:
                    entry['totalCount'] += 1
                # NOTE(review): the original indentation was lost; this
                # assignment is reproduced at loop level, so 'fol' ends up as
                # the location of the most recent tweet with the tag.  If it
                # is meant to be the first-occurrence location, move it into
                # the `entry is None` branch above.
                entry['fol'] = lockey

                if tag not in local:
                    local[tag] = {'localcount': 1, 'fot': tweet['t']}
                else:
                    local[tag]['localcount'] += 1

    return (CollectionOfHashtags, LocationDict, LocationMap)
def PreprocessingFile(files):
    """Copy the tweets accepted by ``BoundingNewYork`` to a ``2block`` file.

    Each tweet of ``files`` is tested with ``BoundingNewYork`` on its
    ``'geo'`` value (or ``'bb'`` when there is no ``'geo'``).  Tweets for
    which the helper returns ``1`` are written as one JSON document per
    line to ``/home/wei/ideamap/reduced_tag_newyork/<name>2block``, where
    ``<name>`` is the part of ``files`` after ``'reduced_tag_newyork/'``.

    Args:
        files: Path of the tweet file to read.  It must contain the
            substring ``'reduced_tag_newyork/'``; otherwise the split below
            raises IndexError.

    Returns:
        None.
    """
    filename = files.split('reduced_tag_newyork/')[1]
    newfilename = '/home/wei/ideamap/reduced_tag_newyork/' + filename + '2block'

    # The context manager closes (and flushes) the output even if reading or
    # serialising a tweet fails part-way through.
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                # NOTE(review): tweets without coordinates still reach the
                # helper with an empty list, as before; confirm that
                # BoundingNewYork tolerates [].
                temploc = []

            if BoundingNewYork(temploc) == 1:
                json.dump(tweet, newfile)
                newfile.write("\n")
def PreprocessingFile(files):
    """Copy the hashtagged tweets accepted by ``BoundingUS`` to a new file.

    Tweets with an empty ``'h'`` field are ignored.  Each remaining tweet is
    tested with ``BoundingUS`` on its ``'geo'`` value (or ``'bb'`` when
    there is no ``'geo'``).  Tweets for which the helper returns ``1`` are
    written as one JSON document per line to
    ``/home/wei/geo_hashtag_data/<name>.txt1``, where ``<name>`` is the part
    of ``files`` after ``'reduced_geo/'``.

    Args:
        files: Path of the tweet file to read.  It must contain the
            substring ``'reduced_geo/'``; otherwise the split below raises
            IndexError.

    Returns:
        None.
    """
    filename = files.split('reduced_geo/')[1]
    newfilename = '/home/wei/geo_hashtag_data/' + filename + '.txt1'

    # The context manager closes (and flushes) the output even if reading or
    # serialising a tweet fails part-way through.
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if not tweet['h']:
                continue

            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                # NOTE(review): tweets without coordinates still reach the
                # helper with an empty list, as before; confirm that
                # BoundingUS tolerates [].
                temploc = []

            if BoundingUS(temploc) == 1:
                json.dump(tweet, newfile)
                newfile.write("\n")
def PreprocessingFile(files, tagdict):
    """Copy in-bounds hashtagged tweets to a new file and tally their tags.

    Tweets with an empty ``'h'`` field are ignored.  The location of each
    remaining tweet is ``tweet['geo']``, or ``tweet['bb']`` when there is no
    ``'geo'``.  A tweet is kept when ``24.52 < loc[0] < 49.38`` and
    ``-124.77 < loc[1] < -66.95`` (strict bounds).  Kept tweets are written
    as one JSON document per line to
    ``/home/wei/new_geo_hashtag_data/<name>.txt``, where ``<name>`` is the
    part of ``files`` after ``'reduced_geo/'``, and each of their tags is
    counted in ``tagdict``.

    Tweets that have neither ``'geo'`` nor ``'bb'`` are skipped; indexing
    the empty placeholder list used to raise IndexError for them.

    Args:
        files: Path of the tweet file to read.  It must contain the
            substring ``'reduced_geo/'``; otherwise the split below raises
            IndexError.
        tagdict: dict of tag -> count, updated IN PLACE so that counts can
            accumulate across several calls.

    Returns:
        None; results are delivered through ``tagdict`` and the output file.
    """
    filename = files.split('reduced_geo/')[1]
    newfilename = '/home/wei/new_geo_hashtag_data/' + filename + '.txt'

    # The context manager closes (and flushes) the output even if reading or
    # serialising a tweet fails part-way through.
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if not tweet['h']:
                continue

            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                continue  # no coordinates: cannot be inside the box

            inside = (24.52 < temploc[0] < 49.38
                      and -124.77 < temploc[1] < -66.95)
            if not inside:
                continue

            for tag in tweet['h']:
                tagdict[tag] = tagdict.get(tag, 0) + 1
            json.dump(tweet, newfile)
            newfile.write("\n")
import json
import os

from twitter import TweetFiles

# Script: keep only the tweets that carry at least one "frequent" hashtag,
# i.e. a tag whose stored count is >= 100000.

# Input tweets.  NOTE: this name shadows the builtin dir(); it is kept
# because it is a module-level name other code may already refer to.
dir = "/home/wei/new_geo_hashtag_data/2011_3.txt"
# File whose first line is the printed form of a {tag: count} dict.
dictpath = "/home/wei/ideamap/2011_3dict"
# Output file for the retained tweets.
path = "/home/wei/new_geo_hashtag_data/reduced100000_2011_3"
path1 = ""  # unused here; kept so the module-level name still exists

with open(dictpath, 'r') as f:
    tagdict = f.readline()
# NOTE(review): eval() executes whatever the file contains.  That is only
# acceptable while the dict file is produced locally by trusted code; for
# anything else switch to ast.literal_eval.
tagdict = eval(tagdict)

# Frequent tags only; each one is echoed to stdout as "<tag> <count>".
tagdict1 = {}
for tag in tagdict.keys():
    if tagdict[tag] >= 100000:
        tagdict1[tag] = tagdict[tag]
        # Single-argument form so the statement is valid (and prints the
        # same text) under both Python 2 and Python 3.
        print("%s %s" % (tag, tagdict[tag]))

# The context manager closes (and flushes) the output deterministically.
with open(path, 'w') as f1:
    for tweet in TweetFiles.iterateTweetsFromFile(dir):
        # Write each tweet at most once, however many frequent tags it has.
        if any(tag in tagdict1 for tag in tweet['h']):
            json.dump(tweet, f1)
            f1.write('\n')