def retrieveTweets(self, ID, Q, geoCode):
    '''Fetch tweets for collection ``ID`` and run similarity analysis and
    classification on them.

    Steps, in order:

    1. Read the stored since-id via ``mongoInt.retrieveSinceID(ID)`` and pass
       it, together with ``Q`` and ``geoCode``, to
       ``twitterInt.retrieveTweets``.
    2. Record the number of fetched tweets with
       ``mongoInt.collectionFeedFrequency``.
    3. Replace every retweet by the status it retweets (flagged with
       ``alreadyRetweeted = True``) and stamp each tweet with
       ``created_time`` parsed from its ``created_at`` field.
    4. Append the fetched tweets to the list returned by
       ``mongoInt.retrieveParentIdTrue(ID)``, run ``self.topicModelLSI`` on
       the combined list and, unless it returned ``0``, call
       ``self.updateRatio``.

    Returns ``[]`` when nothing was fetched, otherwise whatever
    ``self.runClassifier(ID)`` returns.
    '''
    since_id = mongoInt.retrieveSinceID(ID)
    logger.debug('retrieve tweets')
    logger.debug(since_id)
    logger.debug('retrieve tweets123456')

    twits = twitterInt.retrieveTweets(Q, geoCode, since_id)
    mongoInt.collectionFeedFrequency(len(twits), ID)

    def removeRetweets(tweet):
        # A retweet carries the original status under 'retweeted_status';
        # work on that original and remember that it came in as a retweet.
        if 'retweeted_status' in tweet:
            tweet = tweet['retweeted_status']
            tweet['alreadyRetweeted'] = True
            # Logged instead of printed so non-ASCII tweet text cannot fail
            # on the encoding of stdout.
            logger.debug('%s', tweet['text'])
        return tweet

    # Explicit list/loop rather than map(): map() is lazy on Python 3, which
    # would skip the side effects below and break len(twits).
    twits = [removeRetweets(tweet) for tweet in twits]
    for tweet in twits:
        # 'created_at' is parsed with a literal "+0000" offset; timegm is
        # presumably calendar.timegm, giving epoch seconds - TODO confirm.
        tweet['created_time'] = timegm(
            time.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

    # TODO: the interfaces are called directly here instead of through a
    # wrapper; ideally only the tweet text and id would be passed on.
    logger.info('tweets fetched for %s are %s', ID, len(twits))
    if not twits:
        return []

    uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
    # Routine counts are reported at info level; they are not errors.
    logger.info('existing uniqueTweetsFromDB :%s', len(uniqueTweetsFromDB))
    logger.debug('total uniqe feeds %s', uniqueTweetsFromDB)
    uniqueTweetsFromDB.extend(twits)
    logger.info('total combined tweets :%s', len(uniqueTweetsFromDB))

    similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
    # topicModelLSI signals "nothing to update" with 0.
    if similarTweet != 0:
        self.updateRatio(ID, similarTweet, uniqueTweetsFromDB, Q)
    # NOTE(review): assumes the classifier runs whether or not updateRatio
    # was called - confirm against the intended nesting.
    return self.runClassifier(ID)
def retrieveTweets(self,ID,Q,geoCode):
    '''Fetch tweets for collection ``ID`` and run similarity analysis on them.

    Steps, in order:

    1. Read the stored since-id via ``mongoInt.retrieveSinceID(ID)`` and pass
       it, together with ``Q`` and ``geoCode``, to
       ``twitterInt.retrieveTweets``.
    2. Record the number of fetched tweets with
       ``mongoInt.collectionFeedFrequency``.
    3. Replace every retweet by the status it retweets (flagged with
       ``alreadyRetweeted = True``) and stamp each tweet with
       ``created_time`` parsed from its ``created_at`` field.
    4. Append the fetched tweets to the list returned by
       ``mongoInt.retrieveParentIdTrue(ID)``, run ``self.topicModelLSI`` on
       the combined list and, unless it returned ``0``, call
       ``self.updateRatio``.

    Returns the length of the combined list (stored + fetched tweets), or
    ``0`` when nothing was fetched.
    '''
    since_id = mongoInt.retrieveSinceID(ID)
    logger.debug('retrieve tweets')
    logger.debug(since_id)
    logger.debug('retrieve tweets123456')

    twits = twitterInt.retrieveTweets(Q, geoCode, since_id)
    mongoInt.collectionFeedFrequency(len(twits), ID)

    def removeRetweets(tweet):
        # A retweet carries the original status under 'retweeted_status';
        # work on that original and remember that it came in as a retweet.
        if 'retweeted_status' in tweet:
            tweet = tweet['retweeted_status']
            tweet['alreadyRetweeted'] = True
            # Logged instead of printed so non-ASCII tweet text cannot fail
            # on the encoding of stdout.
            logger.debug('%s', tweet['text'])
        return tweet

    # Explicit list/loop rather than map(): map() is lazy on Python 3, which
    # would skip the side effects below and break len(twits).
    twits = [removeRetweets(tweet) for tweet in twits]
    logger.debug('%s', twits)
    for tweet in twits:
        # 'created_at' is parsed with a literal "+0000" offset; timegm is
        # presumably calendar.timegm, giving epoch seconds - TODO confirm.
        tweet['created_time'] = timegm(
            time.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

    # TODO: the interfaces are called directly here instead of through a
    # wrapper; ideally only the tweet text and id would be passed on.
    logger.info('tweets fetched are chellaaa %s', len(twits))
    if not twits:
        # Nothing fetched: leave before touching uniqueTweetsFromDB, which
        # is only bound on the non-empty path.
        # NOTE(review): 0 matches the integer count returned below -
        # confirm callers do not expect something else for "no tweets".
        return 0

    uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
    logger.debug('existing uniqueTweetsFromDB :%s', len(uniqueTweetsFromDB))
    uniqueTweetsFromDB.extend(twits)
    logger.debug('total combined tweets :%s', len(uniqueTweetsFromDB))

    similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
    # topicModelLSI signals "nothing to update" with 0.
    if similarTweet != 0:
        self.updateRatio(ID, similarTweet, uniqueTweetsFromDB, Q)
    return len(uniqueTweetsFromDB)
def similarTopicRemoval(self,collName,similarTweet,twits, Q):
    '''Flag fetched tweets as unique or similar and hand them to storage.

    ``similarTweet`` is read as three parallel sequences: ``[0]`` child ids,
    ``[1]`` ratios and ``[2]`` parent ids.

    First pass over ``similarTweet``:

    * child id == parent id: every tweet in ``twits`` with that id gets
      ``parentId = 1`` and is collected as unique.
    * otherwise, when ``ratio <= 0.999999``: every tweet in ``twits`` with
      the child id gets ``parentId = <parent id>`` and
      ``ratio = float(ratio)`` and is collected as similar.

    The unique tweets are then appended to the list returned by
    ``mongoInt.retrieveParentIdTrue(collName)``, de-duplicated by id (first
    occurrence wins) and passed to ``self.topicModelLSI`` again. Tweets from
    ``twits`` that are their own parent in that second result, followed by
    the similar tweets of the first pass, are given to
    ``self.insertFeedData(collName, ...)``.

    The dicts in ``twits`` are modified in place.

    Returns the number of tweets collected as unique in the first pass.
    '''
    logger.debug('entering')

    # Index the fetched tweets by id once so each (child, parent) pair is a
    # dictionary lookup instead of a scan over all tweets. Ids are matched
    # by value: identity ('is') is not reliable for integers.
    twitsById = {}
    for twit in twits:
        twitsById.setdefault(twit['id'], []).append(twit)

    uniqueTweetsFromTwitter = []
    similarTweetsFromTwitter = []
    for childID, parentID, ratio in zip(similarTweet[0], similarTweet[2], similarTweet[1]):
        if childID == parentID:
            for twit in twitsById.get(parentID, []):
                twit.update({'parentId': 1})
                uniqueTweetsFromTwitter.append(twit)
        elif ratio <= 0.999999:
            for twit in twitsById.get(childID, []):
                twit.update({'parentId': parentID, 'ratio': float(ratio)})
                similarTweetsFromTwitter.append(twit)

    uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(collName)
    logger.debug('existing uniqueTweetsFromDB :%s', len(uniqueTweetsFromDB))
    uniqueTweetsFromDB.extend(uniqueTweetsFromTwitter)

    # A tweet fetched again may already be present in the stored list, so
    # keep only the first tweet seen for each id.
    allUniqueTweetsIDList = [ele['id'] for ele in uniqueTweetsFromDB]
    logger.debug('Duplication allUniqueTweetsIDList contains len : %s value :%s', len(allUniqueTweetsIDList), allUniqueTweetsIDList)
    allUniqueTweetsIDList = set(allUniqueTweetsIDList)
    logger.debug('unique allUniqueTweetsIDList contains len : %s value :%s', len(allUniqueTweetsIDList), allUniqueTweetsIDList)
    seenIDs = set()
    allUniqueTweets = []
    for ele in uniqueTweetsFromDB:
        if ele['id'] not in seenIDs:
            seenIDs.add(ele['id'])
            allUniqueTweets.append(ele)
    logger.debug('allUniqueTweets lenght is %s', allUniqueTweets)

    # Second pass: recompute similarity on the de-duplicated set and keep
    # only the fetched tweets that are still their own parent.
    # NOTE(review): topicModelLSI appears to be able to return 0 elsewhere
    # in this file; that case is not handled here - confirm it cannot occur.
    similarTweet = self.topicModelLSI(allUniqueTweets, Q)
    uniqueTweetsFromTwitter_1 = []
    # The ratios are zipped in only so the iteration stops at the shortest of
    # the three sequences.
    for childID, parentID, _ratio in zip(similarTweet[0], similarTweet[2], similarTweet[1]):
        if childID == parentID:
            for twit in twitsById.get(parentID, []):
                twit.update({'parentId': 1})
                uniqueTweetsFromTwitter_1.append(twit)

    for ele in uniqueTweetsFromTwitter_1:
        logger.debug('current tweets : text: %s, ID: %s', ele['text'], ele['id'])

    logger.info('Before revomal %s twits after similar Topic removal: %s', len(twits), len(uniqueTweetsFromTwitter_1))
    uniqueTweetsFromTwitter_1.extend(similarTweetsFromTwitter)
    self.insertFeedData(collName, uniqueTweetsFromTwitter_1)
    # NOTE(review): the count returned is that of the first pass, not of the
    # list that was just stored - confirm this is what callers expect.
    return len(uniqueTweetsFromTwitter)