Exemplo n.º 1
0
 def PreprocessingFile(files):
     """Extract geo-bounded, keyword-matching tweets from a gzipped tweet file.

     Tweets are read from ``files`` with ``TweetFiles.iterateTweetsFromGzip``.
     Selected tweets are written, one JSON object per line, to a file named
     like ``files`` with everything from ``.txt.gz`` onwards replaced by
     ``.txt``.

     A tweet is selected when both conditions hold:
       * its ``'corrdinate'`` entry (key spelled as in the data) is not None,
         its first value truncated to an integer lies in [30, 50) and its
         second in (-125, -75);
       * its text contains at least one entry of the module-level
         ``keyWords``.

     Unicode text is tokenised on whitespace; any other text is tokenised
     into runs of word characters. Each selected tweet is written exactly
     once, even when several keywords match.

     Tweets with missing keys or malformed values are skipped; I/O errors
     while writing are propagated to the caller.
     """
     newfilename = files.split('.txt.gz')[0] + '.txt'
     # The with-block closes (and flushes) the output even if iteration fails.
     with open(newfilename, 'w') as newfile:
         for tweet in TweetFiles.iterateTweetsFromGzip(files):
             try:
                 coordinate = tweet['corrdinate']
                 if coordinate is None:
                     # No location: skip, rather than testing this tweet
                     # against the coordinates of an earlier one.
                     continue
                 # '%d' truncates the value toward zero and rejects
                 # non-numeric values with TypeError.
                 first = int('%d' % coordinate[0])
                 second = int('%d' % coordinate[1])
                 if not (30 <= first < 50 and -125 < second < -75):
                     continue
                 text = tweet['text']
                 is_unicode = isinstance(text, unicode)
                 if is_unicode:
                     tweetlist = text.split()
                 else:
                     tweetlist = re.findall(r'\w+', text)
                 matched = False
                 for checkword in keyWords:
                     if checkword in tweetlist:
                         matched = True
                         break
                 if not matched:
                     continue
                 if not is_unicode:
                     # Diagnostic output kept for non-unicode matches.
                     print(tweetlist)
                 line = json.dumps(tweet) + "\n"
             except (KeyError, IndexError, TypeError, ValueError):
                 # Malformed tweet: this filter is best-effort, so skip it.
                 continue
             newfile.write(line)
Exemplo n.º 2
0
    def PreprocessingFile(files):
        """Count hashtag occurrences in one tweet file and save the counts.

        ``files`` is a path that must contain ``'reduced_tag_newyork/'``; the
        part after that marker is used as the base name of the output file
        (an IndexError is raised when the marker is absent).

        Every tag found in the ``'h'`` list of each tweet yielded by
        ``TweetFiles.iterateTweetsFromFile(files)`` is counted. The resulting
        dictionary is written as a single line (its ``str()`` form) to
        ``./<name>dict`` and is also returned.

        Returns:
            dict mapping each tag to the number of times it occurred.
        """
        filename = files.split('reduced_tag_newyork/')[1]
        tagdict = {}
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            for tag in tweet['h']:
                # dict.get avoids scanning the key list for every tag.
                tagdict[tag] = tagdict.get(tag, 0) + 1
        # The with-block guarantees the dump is flushed and closed before
        # the function returns.
        with open('./' + filename + 'dict', 'w') as tagdictfile:
            tagdictfile.write(str(tagdict) + '\n')
        return tagdict
Exemplo n.º 3
0
def BuildDataStructure(directory):
    """Index hashtags by tag and by location for one tweet file.

    Despite its name, ``directory`` is passed straight to
    ``TweetFiles.iterateTweetsFromFile`` and is therefore a single path.

    Only tweets with a non-empty ``'h'`` (hashtag) list and a ``'geo'`` or
    ``'bb'`` entry are used; ``'geo'`` takes precedence. The location key is
    ``str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))``.
    NOTE(review): the numeric arguments look like a grid step and a New York
    bounding box -- confirm against CheckLocation.

    Returns:
        (CollectionOfHashtags, LocationDict) where

        CollectionOfHashtags[tag] = {
            'totalCount': occurrences of the tag,
            'uids': user ids that used it (in first-seen order),
            'fot': the 't' value of the first tweet seen with the tag,
            'fol': the location key of that first tweet,
        }
        LocationDict[loc] = {
            'uids': user ids seen at that location,
            tag: {'localcount': occurrences there,
                  'fot': the 't' value of the first such tweet},
        }

    NOTE(review): tags share a namespace with the 'uids' key inside
    LocationDict[loc]; a hashtag literally equal to 'uids' would collide.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            print(n)  # progress indicator
        if not tweet['h']:
            continue
        if 'geo' in tweet:
            point = tweet['geo']
        elif 'bb' in tweet:
            point = tweet['bb']
        else:
            # No location at all: skip, rather than attributing the tweet to
            # whatever location the previous tweet happened to have.
            continue
        loc = str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))
        uid = tweet['user']['id']

        if loc not in LocationDict:
            LocationDict[loc] = {'uids': []}
        place = LocationDict[loc]
        if uid not in place['uids']:
            place['uids'].append(uid)

        for tag in tweet['h']:
            if tag in CollectionOfHashtags:
                info = CollectionOfHashtags[tag]
                info['totalCount'] += 1
                if uid not in info['uids']:
                    info['uids'].append(uid)
            else:
                CollectionOfHashtags[tag] = {
                    'uids': [uid],
                    'totalCount': 1,
                    'fot': tweet['t'],
                    'fol': loc,
                }
            if tag not in place:
                place[tag] = {'localcount': 1, 'fot': tweet['t']}
            else:
                place[tag]['localcount'] += 1
    return (CollectionOfHashtags, LocationDict)
Exemplo n.º 4
0
def BuildDataStructure(directory):
    """Index hashtags by tag and by numbered location for one tweet file.

    Despite its name, ``directory`` is passed straight to
    ``TweetFiles.iterateTweetsFromFile`` and is therefore a single path.

    Only tweets with a non-empty ``'h'`` list and a ``'geo'`` or ``'bb'``
    entry are used; ``'geo'`` takes precedence. Locations come from
    ``CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)``.
    NOTE(review): the numeric arguments look like a grid step and a
    bounding box -- confirm against CheckLocation.

    Each distinct location is given a sequential integer key, starting at 1.
    The key is resolved once per tweet, so every tag of a tweet is counted
    at that tweet's own location.

    Returns:
        (CollectionOfHashtags, LocationDict, LocationMap) where

        CollectionOfHashtags[tag] = {
            'totalCount': occurrences of the tag,
            'fot': the 't' value of the first tweet seen with the tag,
            'fol': the location key of that first tweet,
        }
        LocationDict[key][tag] = {
            'localcount': occurrences of the tag at that location,
            'fot': the 't' value of the first such tweet,
        }
        LocationMap[key] = the location value returned by CheckLocation.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            print(n)  # progress indicator
        if not tweet["h"]:
            continue
        if "geo" in tweet:
            point = tweet["geo"]
        elif "bb" in tweet:
            point = tweet["bb"]
        else:
            # No location at all: skip, rather than reusing the location of
            # the previous tweet.
            continue
        loc = CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)

        # Resolve the location key for THIS tweet before touching any tag.
        # Values are searched linearly because loc may not be hashable.
        if loc not in LocationMap.values():
            locationNum += 1
            LocationMap[locationNum] = loc
            LocationDict[locationNum] = {}
        lockey = GetKeyFromValue(LocationMap, loc)
        localTags = LocationDict[lockey]

        for tag in tweet["h"]:
            if tag in CollectionOfHashtags:
                CollectionOfHashtags[tag]["totalCount"] += 1
            else:
                CollectionOfHashtags[tag] = {
                    "totalCount": 1,
                    "fot": tweet["t"],
                    "fol": lockey,
                }
            if tag not in localTags:
                localTags[tag] = {"localcount": 1, "fot": tweet["t"]}
            else:
                localTags[tag]["localcount"] += 1
    return (CollectionOfHashtags, LocationDict, LocationMap)
Exemplo n.º 5
0
def BuildDataStructure(directory):
    """Index hashtags by tag and by numbered location for a directory of files.

    Every entry returned by ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromFile``. Only tweets with a non-empty
    ``'h'`` list and a ``'geo'`` or ``'bb'`` entry are used; ``'geo'`` takes
    precedence. Locations come from
    ``CheckLocation(point, 0.01, -125, -75, 30, 50)``.
    NOTE(review): the numeric arguments look like a grid step and a
    bounding box -- confirm against CheckLocation.

    Each distinct location is given a sequential integer key, starting at 1.
    The key is resolved once per tweet, so every tag of a tweet is counted
    at that tweet's own location.

    Returns:
        (CollectionOfHashtags, LocationDict, LocationMap) where

        CollectionOfHashtags[tag] = {
            'totalCount': occurrences of the tag,
            'fot': the 't' value of the first tweet seen with the tag,
            'fol': the location key of that first tweet,
        }
        LocationDict[key][tag] = {
            'localcount': occurrences of the tag at that location,
            'fot': the 't' value of the first such tweet,
        }
        LocationMap[key] = the location value returned by CheckLocation.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    for name in os.listdir(directory):
        print("current file reading is: " + name)
        # os.path.join works whether or not directory ends with a separator.
        path = os.path.join(directory, name)
        for tweet in TweetFiles.iterateTweetsFromFile(path):
            if not tweet['h']:
                continue
            if 'geo' in tweet:
                point = tweet['geo']
            elif 'bb' in tweet:
                point = tweet['bb']
            else:
                # No location at all: skip, rather than reusing the location
                # of the previous tweet.
                continue
            loc = CheckLocation(point, 0.01, -125, -75, 30, 50)

            # Resolve the location key for THIS tweet before touching any
            # tag. Values are searched linearly because loc may not be
            # hashable.
            if loc not in LocationMap.values():
                locationNum += 1
                LocationMap[locationNum] = loc
                LocationDict[locationNum] = {}
            lockey = GetKeyFromValue(LocationMap, loc)
            localTags = LocationDict[lockey]

            for tag in tweet['h']:
                if tag in CollectionOfHashtags:
                    CollectionOfHashtags[tag]['totalCount'] += 1
                else:
                    CollectionOfHashtags[tag] = {
                        'totalCount': 1,
                        'fot': tweet['t'],
                        'fol': lockey,
                    }
                if tag not in localTags:
                    localTags[tag] = {'localcount': 1, 'fot': tweet['t']}
                else:
                    localTags[tag]['localcount'] += 1
    return (CollectionOfHashtags, LocationDict, LocationMap)
Exemplo n.º 6
0
def DocWordFreqDict(path):
    """Build one word-frequency table per tweet of a gzipped tweet file.

    Each tweet yielded by ``TweetFiles.iterateTweetsFromGzip(path)`` is
    treated as its own document. Documents are numbered from 1 in iteration
    order, and the ``'text'`` of each is split into runs of word characters.

    Returns:
        dict mapping document number -> {word: occurrences in that tweet}.
    """
    frequencies = {}
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for doc_number, record in enumerate(tweets, 1):
        counts = {}
        frequencies[doc_number] = counts
        for word in re.findall(r'\w+', record['text']):
            counts[word] = counts.get(word, 0) + 1
    return frequencies
Exemplo n.º 7
0
    def PreprocessingFile(files):
        """Copy the tweets accepted by BoundingNewYork into a '2block' file.

        ``files`` is a path that must contain ``'reduced_tag_newyork/'``; the
        part after that marker names the output file
        ``/home/wei/ideamap/reduced_tag_newyork/<name>2block`` (an IndexError
        is raised when the marker is absent).

        For each tweet the location is its ``'geo'`` entry, else its ``'bb'``
        entry, else an empty list. Tweets for which
        ``BoundingNewYork(location) == 1`` are written as one JSON object
        per line.
        """
        filename = files.split('reduced_tag_newyork/')[1]
        newfilename = '/home/wei/ideamap/reduced_tag_newyork/' + filename + '2block'
        # The with-block closes (and flushes) the output even if reading or
        # serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    temploc = []
                if BoundingNewYork(temploc) == 1:
                    json.dump(tweet, newfile)
                    newfile.write("\n")
Exemplo n.º 8
0
    def PreprocessingFile(files):
        """Copy hashtagged tweets accepted by BoundingUS into a '.txt1' file.

        ``files`` is a path that must contain ``'reduced_geo/'``; the part
        after that marker names the output file
        ``/home/wei/geo_hashtag_data/<name>.txt1`` (an IndexError is raised
        when the marker is absent).

        Only tweets with a non-empty ``'h'`` list are considered. The
        location is the tweet's ``'geo'`` entry, else its ``'bb'`` entry,
        else an empty list. Tweets for which ``BoundingUS(location) == 1``
        are written as one JSON object per line.
        """
        filename = files.split('reduced_geo/')[1]
        newfilename = '/home/wei/geo_hashtag_data/' + filename + '.txt1'
        # The with-block closes (and flushes) the output even if reading or
        # serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if not tweet['h']:
                    continue
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    temploc = []
                if BoundingUS(temploc) == 1:
                    json.dump(tweet, newfile)
                    newfile.write("\n")
Exemplo n.º 9
0
    def PreprocessingFile(files, tagdict):
        """Copy hashtagged tweets inside a fixed box and tally their tags.

        ``files`` is a path that must contain ``'reduced_geo/'``; the part
        after that marker names the output file
        ``/home/wei/new_geo_hashtag_data/<name>.txt`` (an IndexError is
        raised when the marker is absent).

        Only tweets with a non-empty ``'h'`` list are considered. The
        location is the tweet's ``'geo'`` entry, else its ``'bb'`` entry.
        Tweets without a usable location are skipped. A tweet is kept when
        ``24.52 < location[0] < 49.38`` and ``-124.77 < location[1] < -66.95``
        (presumably latitude/longitude bounds -- confirm against the data).

        Kept tweets are written as one JSON object per line, and each of
        their tags is counted in ``tagdict``.

        Args:
            files: path of the tweet file to read.
            tagdict: dict of tag -> count; updated IN PLACE so that counts
                can be accumulated across several calls.
        """
        filename = files.split('reduced_geo/')[1]
        newfilename = '/home/wei/new_geo_hashtag_data/' + filename + '.txt'
        # The with-block closes (and flushes) the output even if reading or
        # serialising a tweet fails part-way through.
        with open(newfilename, 'w') as newfile:
            for tweet in TweetFiles.iterateTweetsFromFile(files):
                if not tweet['h']:
                    continue
                if 'geo' in tweet:
                    temploc = tweet['geo']
                elif 'bb' in tweet:
                    temploc = tweet['bb']
                else:
                    temploc = []
                if not temploc:
                    # No coordinates to test; indexing would raise.
                    continue
                if 24.52 < temploc[0] < 49.38 and -124.77 < temploc[1] < -66.95:
                    for tag in tweet['h']:
                        # dict.get avoids scanning the key list per tag.
                        tagdict[tag] = tagdict.get(tag, 0) + 1
                    json.dump(tweet, newfile)
                    newfile.write("\n")
Exemplo n.º 10
0
from twitter import TweetFiles
import os
import json
# Keep only the tweets that carry at least one hashtag whose recorded count
# is 100000 or more, and write them one JSON object per line.

# NOTE: `dir` shadows the builtin dir(); the name is kept because it is a
# module-level name of this script.
dir = "/home/wei/new_geo_hashtag_data/2011_3.txt"  # input tweets
dictpath = "/home/wei/ideamap/2011_3dict"  # tag -> count dictionary dump
path = "/home/wei/new_geo_hashtag_data/reduced100000_2011_3"  # output
path1 = ""

# Only the first line of the dictionary file is read.
with open(dictpath, 'r') as f:
    tagdict = f.readline()
# SECURITY NOTE(review): eval() executes whatever expression the file holds.
# It is acceptable only while the dictionary file is produced locally and
# trusted; consider ast.literal_eval if the file can come from elsewhere.
tagdict = eval(tagdict)

# Select the frequent tags and echo them with their counts.
tagdict1 = {}
for tag in tagdict.keys():
    if tagdict[tag] >= 100000:
        tagdict1[tag] = tagdict[tag]
        print("%s %s" % (tag, tagdict[tag]))

# The with-block guarantees the output is flushed and closed at the end.
with open(path, 'w') as f1:
    for tweet in TweetFiles.iterateTweetsFromFile(dir):
        for tag in tweet['h']:
            if tag in tagdict1:
                json.dump(tweet, f1)
                f1.write('\n')
                break  # one matching tag is enough; write the tweet once
Exemplo n.º 11
0
def DocidWordFreqTimeLocationDictFromDir(directory):
    """Index the tweets of a directory by word, time stamp and location stamp.

    Every entry returned by ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``. Each tweet is one document.
    Documents are numbered from 1 in reading order, and a tweet that cannot
    be parsed still consumes a number.

    A tweet is parsed completely before anything is recorded, so a malformed
    tweet (missing key, bad number, unknown month, ...) is skipped without
    leaving a partial record behind. Only data errors are tolerated;
    programming errors such as an undefined name are propagated.

    Returns:
        A 6-tuple:
        collectionWordDict: word -> occurrences over all tweets.
        dict2: docId -> {'docWordCount': number of words,
                         'time': {'year', 'month', 'day', 'hour', 'minute'},
                         'location': [first, second] as integer strings,
                         'words': {word: occurrences in the tweet}}.
        timeLocationStamps: docId -> '<timeStamp>,<locationStamp>'.
        timeLocationStampToDocIdDict: that combined stamp -> [docIds].
        timeToLocationDict: timeStamp -> [locationStamps], one per tweet.
        locationToTimeDict: locationStamp -> [timeStamps], one per tweet.

    NOTE(review): the location stamp concatenates the two integer strings
    with no separator, so distinct pairs can produce the same stamp; the
    format is kept because callers may depend on it.
    """
    collectionWordDict = {}
    dict2 = {}
    timeLocationStamps = {}
    timeLocationStampToDocIdDict = {}
    timeToLocationDict = {}
    locationToTimeDict = {}
    docId = 0
    for name in os.listdir(directory):
        print("current file is: " + name)
        # os.path.join works whether or not directory ends with a separator.
        path = os.path.join(directory, name)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            docId += 1
            try:
                timeFields, timeStamp = _tweet_time_fields(tweet['time'])
                tempLoc1, tempLoc2 = _tweet_location_fields(tweet['corrdinate'])
                words = re.findall(r'\w+', tweet['text'])
            except (KeyError, IndexError, TypeError, ValueError):
                # Malformed tweet: indexing is best-effort, so skip it.
                continue

            wordCounts = {}
            for word in words:
                wordCounts[word] = wordCounts.get(word, 0) + 1
                collectionWordDict[word] = collectionWordDict.get(word, 0) + 1

            dict2[docId] = {
                'docWordCount': len(words),
                'time': timeFields,
                'location': [tempLoc1, tempLoc2],
                'words': wordCounts,
            }

            locationStamp = tempLoc1 + tempLoc2
            combinedStamp = timeStamp + ',' + locationStamp
            timeLocationStamps[docId] = combinedStamp
            timeLocationStampToDocIdDict.setdefault(combinedStamp, []).append(docId)
            timeToLocationDict.setdefault(timeStamp, []).append(locationStamp)
            locationToTimeDict.setdefault(locationStamp, []).append(timeStamp)
    return (collectionWordDict, dict2, timeLocationStamps,
            timeLocationStampToDocIdDict, timeToLocationDict,
            locationToTimeDict)


def _tweet_time_fields(rawTime):
    """Parse a space-separated time string into fields and a time stamp.

    Uses field 1 as a month name (looked up in the module-level
    ``monthDict``), field 2 as the day, field 3 as ``hh:mm[:ss]`` and field 5
    as the year. Returns ``(fields, stamp)`` where ``stamp`` is
    ``YYYYMMDDhh`` followed by the ten-minute bucket (0-5) of the minute.
    """
    parts = re.split(r' ', rawTime)
    clock = re.split(r':', parts[3])
    fields = {
        'year': int(parts[5]),
        'month': monthDict[parts[1]],
        'day': int(parts[2]),
        'hour': int(clock[0]),
        'minute': int(clock[1]),
    }
    # Ten-minute bucket, clamped to 0..5.
    bucket = max(0, min(fields['minute'] // 10, 5))
    stamp = '%04d%02d%02d%02d%d' % (fields['year'], fields['month'],
                                    fields['day'], fields['hour'], bucket)
    return fields, stamp


def _tweet_location_fields(coordinate):
    """Return the two coordinate values as integer strings ('0', '0' if None)."""
    if coordinate is not None:
        return '%d' % coordinate[0], '%d' % coordinate[1]
    print("no location info")
    return '0', '0'
Exemplo n.º 12
0
def DocidWordFreqTimeLocationDictFromDir(directory, keywords):
    """Index keyword-matching tweets of a directory by keyword and by day.

    Every entry of ``os.listdir(directory)`` is appended to ``directory``
    (plain string concatenation) and read with
    ``TweetFiles.iterateTweetsFromGzip``. A tweet is kept when
    ``similarityCal(keyword, text)`` is greater than zero for at least one
    entry of ``keywords``; documents are keyed by the tweet's ``'id'``.

    Returns:
        A 4-tuple:
        collectionWordDict: keyword -> number of tweets it matched.
        dict2: tweet id -> {'words': {keyword: count},
                            'docWordCount': number of matching keywords,
                            'time': {'year', 'month', 'day', 'hour',
                                     'minute'}}.
        timeStamps: tweet id -> 'YYYYMMDD' day stamp.
        timeStampToDocIdDict: day stamp -> [tweet ids].

    Errors raised while parsing a tweet are not caught here.
    """
    collectionWordDict = {}
    dict2 = {}
    timeStamps = {}
    timeStampToDocIdDict = {}
    for entry in os.listdir(directory):
        print("current file is: " + entry)
        gzPath = directory + entry
        for tweet in TweetFiles.iterateTweetsFromGzip(gzPath):
            docId = tweet['id']
            record = {'words': {}, 'docWordCount': 0}
            dict2[docId] = record

            # Count every keyword that is similar to the tweet text.
            matches = record['words']
            for token in keywords:
                score = similarityCal(token, tweet['text'])
                if score <= 0:
                    continue
                record['docWordCount'] += 1
                matches[token] = matches.get(token, 0) + 1
                collectionWordDict[token] = collectionWordDict.get(token, 0) + 1

            # A tweet matching no keyword is not indexed at all.
            if not record['docWordCount']:
                del dict2[docId]
                continue

            # Time fields: month name at index 1, day at 2, clock at 3,
            # year at 5 of the space-separated time string.
            pieces = re.split(r' ', tweet['time'])
            when = {}
            record['time'] = when
            when['year'] = int(pieces[5])
            dayStamp = '%04d' % when['year']
            monthNumber = monthDict[pieces[1]]
            when['month'] = monthNumber
            dayStamp += '%02d' % monthNumber
            when['day'] = int(pieces[2])
            dayStamp += '%02d' % when['day']
            clock = re.split(r':', pieces[3])
            when['hour'] = int(clock[0])
            when['minute'] = int(clock[1])

            # Hour and minute are stored but the stamp has day resolution.
            timeStamps[docId] = dayStamp
            timeStampToDocIdDict.setdefault(dayStamp, []).append(docId)

    return (collectionWordDict, dict2, timeStamps, timeStampToDocIdDict)
Exemplo n.º 13
0
def UserIdNameDict(path):
    """Map user ids to user names for the tweets of a gzipped tweet file.

    Reads tweets with ``TweetFiles.iterateTweetsFromGzip(path)`` and uses the
    ``'id'`` and ``'name'`` of each tweet's ``'user'`` entry. When an id
    occurs more than once, the name from the last tweet read wins.

    Returns:
        dict mapping user id -> user name.
    """
    namesById = {}
    for record in TweetFiles.iterateTweetsFromGzip(path):
        author = record['user']
        displayName = author['name']
        namesById[author['id']] = displayName
    return namesById
Exemplo n.º 14
0
def ReadFile(path):
    """Print every tweet of a gzipped tweet file, one per line.

    Tweets are obtained from ``TweetFiles.iterateTweetsFromGzip(path)`` and
    printed in iteration order; nothing is returned.
    """
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for record in tweets:
        print(record)