Exemplo n.º 1
0
 def PreprocessingFile(files):
     """Filter a gzipped tweet file down to geo-bounded tweets that hit a keyword.

     Tweets are read with ``TweetFiles.iterateTweetsFromGzip(files)``.  Each
     selected tweet is written as one JSON document per line to an output
     file named after the part of ``files`` before ``'.txt.gz'`` with
     ``'.txt'`` appended.

     A tweet is selected when all of the following hold:

     * ``tweet['corrdinate']`` is not None,
     * its first coordinate, truncated to an int, is in [30, 50) and its
       second is in (-125, -75),
     * at least one entry of the module-level ``keyWords`` equals one of the
       tweet's tokens.

     Tweets that lack a required field or carry an unusable value are
     skipped (best effort); I/O errors are not swallowed.

     Args:
         files: path of the gzipped tweet file to read.
     """
     newfilename = files.split('.txt.gz')[0] + '.txt'
     # ``with`` guarantees the output file is closed even if iteration fails.
     with open(newfilename, 'w') as newfile:
         for tweet in TweetFiles.iterateTweetsFromGzip(files):
             try:
                 # NOTE(review): 'corrdinate' (sic) is presumably the key the
                 # input data really uses -- confirm before "fixing" it.
                 coordinate = tweet['corrdinate']
                 if coordinate is None:
                     # Never fall back to the previous tweet's coordinates.
                     continue
                 # int() truncates toward zero, same as the former '%d' round trip.
                 loc1 = int(coordinate[0])
                 loc2 = int(coordinate[1])
                 if not (30 <= loc1 < 50 and -125 < loc2 < -75):
                     continue

                 text = tweet['text']
                 if isinstance(text, unicode):
                     # Python 2 unicode text: whitespace-separated tokens.
                     tweetlist = text.split()
                 else:
                     # Anything else: runs of word characters.
                     tweetlist = re.findall(r'\w+', text)

                 # One line per tweet, however many keywords it matches.
                 if any(checkword in tweetlist for checkword in keyWords):
                     newfile.write(json.dumps(tweet) + "\n")
             except (KeyError, IndexError, TypeError, ValueError):
                 # Malformed tweet: skip it and keep going.
                 continue
Exemplo n.º 2
0
def DocWordFreqDict(path):
    """Build one word-frequency table per tweet of a gzipped tweet file.

    Every tweet yielded by ``TweetFiles.iterateTweetsFromGzip(path)`` is
    treated as its own document, numbered from 1 in iteration order.

    Args:
        path: path handed to ``TweetFiles.iterateTweetsFromGzip``.

    Returns:
        dict mapping document number -> {word: occurrence count}, where the
        words are the runs of word characters found in ``tweet['text']``.
    """
    frequencies = {}
    for number, tweet in enumerate(TweetFiles.iterateTweetsFromGzip(path), 1):
        # Register the document first so tweets without words still appear.
        counts = {}
        frequencies[number] = counts
        for word in re.findall(r'\w+', tweet['text']):
            counts[word] = counts.get(word, 0) + 1
    return frequencies
Exemplo n.º 3
0
def DocidWordFreqTimeLocationDictFromDir(directory):
    """Index the tweets of every file in ``directory`` by words, time and location.

    Each entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``.  Every tweet is one document and
    consumes the next sequential integer id (starting at 1), whether or not
    it ends up indexed.

    ``tweet['time']`` is split on single spaces; field 1 is looked up in the
    module-level ``monthDict``, field 2 is the day, field 3 is a
    colon-separated clock whose first two parts are hour and minute, and
    field 5 is the year.
    NOTE(review): this looks like Twitter's ``created_at`` layout -- confirm.

    A tweet that lacks a required field or carries an unusable value is
    skipped as a whole: it contributes to none of the returned dictionaries,
    so no half-built record can leak out.

    Args:
        directory: directory holding the gzipped tweet files.  A trailing
            path separator is accepted but no longer required.

    Returns:
        A 6-tuple:
          * collectionWordDict: word -> occurrences over all indexed tweets
          * dict2: docId -> {'docWordCount': int, 'time': {'year', 'month',
            'day', 'hour', 'minute'}, 'location': [loc1, loc2],
            'words': {word: count}}
          * timeLocationStamps: docId -> "timeStamp,locationStamp"
          * timeLocationStampToDocIdDict: that combined stamp -> [docIds]
          * timeToLocationDict: timeStamp -> [locationStamps], one per tweet
          * locationToTimeDict: locationStamp -> [timeStamps], one per tweet

        ``timeStamp`` is YYYYMMDDHH followed by a ten-minute bucket digit
        (0-5).  ``locationStamp`` is the two integer-formatted coordinates
        concatenated without a separator ('00' when the tweet has none).
    """
    collectionWordDict = {}            # word -> frequency
    dict2 = {}                         # docId -> per-document record
    timeLocationStamps = {}            # docId -> "time,location"
    timeLocationStampToDocIdDict = {}  # "time,location" -> [docIds]
    timeToLocationDict = {}            # time -> [locations]
    locationToTimeDict = {}            # location -> [times]
    docId = 0
    for filename in os.listdir(directory):
        print("current file is: " + filename)
        # os.path.join works with or without a trailing separator.
        path = os.path.join(directory, filename)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            docId += 1
            # Parse everything into locals first; shared state is only
            # touched once the whole tweet is known to be usable.
            try:
                timeFields = re.split(r' ', tweet['time'])
                clock = re.split(r':', timeFields[3])
                timeInfo = {
                    'year': int(timeFields[5]),
                    'month': monthDict[timeFields[1]],
                    'day': int(timeFields[2]),
                    'hour': int(clock[0]),
                    'minute': int(clock[1]),
                }
                # Ten-minute bucket, clamped to 0..5 like the original chain.
                minuteStamp = min(max(timeInfo['minute'] // 10, 0), 5)
                timeStamp = '%04d%02d%02d%02d%d' % (
                    timeInfo['year'], timeInfo['month'], timeInfo['day'],
                    timeInfo['hour'], minuteStamp)

                # NOTE(review): 'corrdinate' (sic) is presumably the key the
                # input data really uses -- confirm before "fixing" it.
                coordinate = tweet['corrdinate']
                if coordinate is not None:
                    tempLoc1 = '%d' % coordinate[0]
                    tempLoc2 = '%d' % coordinate[1]
                else:
                    print("no location info")
                    tempLoc1 = '0'
                    tempLoc2 = '0'

                tokens = re.findall(r'\w+', tweet['text'])
            except (KeyError, IndexError, TypeError, ValueError):
                # Malformed tweet: skip it and keep going.
                continue

            words = {}
            for token in tokens:
                words[token] = words.get(token, 0) + 1
                collectionWordDict[token] = collectionWordDict.get(token, 0) + 1

            dict2[docId] = {
                'docWordCount': len(tokens),
                'time': timeInfo,
                'location': [tempLoc1, tempLoc2],
                'words': words,
            }

            # NOTE(review): no separator between the two coordinates, so
            # e.g. ('12', '3') and ('1', '23') share a stamp; kept because
            # callers may rely on the existing key format.
            locationStamp = tempLoc1 + tempLoc2
            combinedStamp = timeStamp + ',' + locationStamp
            timeLocationStamps[docId] = combinedStamp
            timeLocationStampToDocIdDict.setdefault(combinedStamp, []).append(docId)
            timeToLocationDict.setdefault(timeStamp, []).append(locationStamp)
            locationToTimeDict.setdefault(locationStamp, []).append(timeStamp)
    return (collectionWordDict, dict2, timeLocationStamps, timeLocationStampToDocIdDict, timeToLocationDict, locationToTimeDict)
Exemplo n.º 4
0
def DocidWordFreqTimeLocationDictFromDir(directory, keywords):
    """Index the keyword-matching tweets of every file in ``directory`` by day.

    Each entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``.  A tweet is kept when
    ``similarityCal(token, tweet['text'])`` is greater than zero for at
    least one ``token`` of ``keywords``; kept tweets are keyed by
    ``tweet['id']``.  Tweets without a matching keyword are ignored before
    their timestamp is looked at.

    ``tweet['time']`` is split on single spaces; field 1 is looked up in the
    module-level ``monthDict``, field 2 is the day, field 3 is a
    colon-separated clock whose first two parts are hour and minute, and
    field 5 is the year.
    NOTE(review): this looks like Twitter's ``created_at`` layout -- confirm.

    Errors raised while processing a tweet (missing keys, unparsable
    numbers, ...) are not caught and propagate to the caller.

    Args:
        directory: directory holding the gzipped tweet files.  A trailing
            path separator is accepted but no longer required.
        keywords: iterable of tokens passed to ``similarityCal``.  A token
            listed more than once is counted once per listing.

    Returns:
        A 4-tuple:
          * collectionWordDict: token -> number of matches over all kept tweets
          * dict2: docId -> {'words': {token: count}, 'docWordCount': int,
            'time': {'year', 'month', 'day', 'hour', 'minute'}}
          * timeStamps: docId -> "YYYYMMDD"
          * timeStampToDocIdDict: "YYYYMMDD" -> [docIds]
    """
    collectionWordDict = {}    # token -> frequency
    dict2 = {}                 # docId -> per-document record
    timeStamps = {}            # docId -> "YYYYMMDD"
    timeStampToDocIdDict = {}  # "YYYYMMDD" -> [docIds]
    for filename in os.listdir(directory):
        print("current file is: " + filename)
        # os.path.join works with or without a trailing separator.
        path = os.path.join(directory, filename)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            docId = tweet['id']

            # Count matching keywords locally; nothing is stored for the
            # tweet unless at least one keyword matches.
            words = {}
            docWordCount = 0
            for token in keywords:
                if similarityCal(token, tweet['text']) <= 0:
                    continue
                docWordCount += 1
                words[token] = words.get(token, 0) + 1
            if not words:
                continue

            timeFields = re.split(r' ', tweet['time'])
            clock = re.split(r':', timeFields[3])
            timeInfo = {
                'year': int(timeFields[5]),
                'month': monthDict[timeFields[1]],
                'day': int(timeFields[2]),
                'hour': int(clock[0]),
                'minute': int(clock[1]),
            }
            # Day-level stamp: hour and minute are recorded but not part of it.
            timeStamp = '%04d%02d%02d' % (
                timeInfo['year'], timeInfo['month'], timeInfo['day'])

            for token, count in words.items():
                collectionWordDict[token] = collectionWordDict.get(token, 0) + count
            dict2[docId] = {
                'words': words,
                'docWordCount': docWordCount,
                'time': timeInfo,
            }
            timeStamps[docId] = timeStamp
            timeStampToDocIdDict.setdefault(timeStamp, []).append(docId)

    return (collectionWordDict, dict2, timeStamps, timeStampToDocIdDict)
Exemplo n.º 5
0
def UserIdNameDict(path):
    """Map each user id found in a gzipped tweet file to that user's name.

    Tweets come from ``TweetFiles.iterateTweetsFromGzip(path)``; when an id
    occurs in several tweets, the name from the last one wins.

    Args:
        path: path handed to ``TweetFiles.iterateTweetsFromGzip``.

    Returns:
        dict mapping ``tweet['user']['id']`` -> ``tweet['user']['name']``.
    """
    names_by_id = {}
    for tweet in TweetFiles.iterateTweetsFromGzip(path):
        user = tweet['user']
        names_by_id[user['id']] = user['name']
    return names_by_id
Exemplo n.º 6
0
def ReadFile(path):
    """Print every tweet of the gzipped tweet file at ``path``.

    Tweets come from ``TweetFiles.iterateTweetsFromGzip(path)`` and are
    printed one per ``print`` call, in iteration order.  Nothing is returned.
    """
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for tweet in tweets:
        print(tweet)