示例#1
0
 def retrieveTweets(self,ID,Q,geoCode):
     '''Fetch tweets for a query, timestamp them and run POS analysis.

     Reads the stored since_id for ``ID`` from ``mongoInt.retrieveSinceID``
     and forwards it, together with ``Q`` and ``geoCode``, to
     ``twitterInt.retrieveTweets``.  The size of the fetched batch is
     reported to ``mongoInt.collectionFeedFrequency``; every tweet then gets
     a ``created_time`` entry and the batch is handed to ``self.posAnalysis``.

     :param ID: identifier passed to the mongoInt helpers.
     :param Q: search query forwarded to twitterInt.retrieveTweets.
     :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
     :returns: number of tweets fetched by this call.
     '''
     since_id = mongoInt.retrieveSinceID(ID)
     logger.debug('retrieve tweets')
     logger.debug(since_id)
     logger.debug('retrieve tweets123456')
     twits = twitterInt.retrieveTweets(Q, geoCode, since_id)

     mongoInt.collectionFeedFrequency(len(twits), ID)

     # Explicit loop instead of map(): map() is lazy on Python 3, so using it
     # only for its side effect would leave the tweets unstamped there.
     # The "+0000" literal in the format means created_at is UTC, hence
     # timegm (presumably calendar.timegm) rather than a local-time conversion.
     for tw in twits:
         tw['created_time'] = timegm(
             time.strptime(tw['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

     # TODO: called directly instead of through a wrapper; only the tweet
     # text and id should be passed here eventually.
     logger.info('tweets fetched are %s',twits)
     self.posAnalysis(twits)

     # TODO: this return value is meant to be replaced by a MongoDB lookup.
     return len(twits)
示例#2
0
 def retrieveTweets(self, ID, Q, geoCode):
     '''Fetch tweets for a query, timestamp them and store them via mongoInt.

     Tweets come from ``twitterInt.retrieveTweets(Q, geoCode)``.  Each tweet
     gets a ``created_time`` entry derived from its ``created_at`` string.
     A non-empty batch is passed to ``mongoInt.insertFeedData``; for an empty
     batch ``mongoInt.createCollection(ID)`` is called instead and a warning
     is logged if it reports failure.

     :param ID: identifier passed to the mongoInt helpers.
     :param Q: search query forwarded to twitterInt.retrieveTweets.
     :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
     :returns: the fetched tweets, with ``created_time`` added.
     '''
     logger.debug('retrieve tweets')
     twits = twitterInt.retrieveTweets(Q, geoCode)

     # strptime() already yields a struct_time; feeding that to time.gmtime()
     # (which expects seconds) raises TypeError, so convert it directly.
     # The "+0000" literal in the format means created_at is UTC.
     for tw in twits:
         tw['created_time'] = timegm(
             time.strptime(tw['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

     logger.debug(
         'storing tweets of twitter of both location based on keyword mongoDb'
     )
     if twits:
         mongoInt.insertFeedData(ID, twits)
     elif not mongoInt.createCollection(ID):
         logger.warning('unable to create collection in mongodb')

     # TODO: returning the tweets is meant to be replaced by a MongoDB lookup.
     return twits
示例#3
0
    def retrieveTweets(self, ID, Q, geoCode):
        '''Fetch new tweets, merge them with stored ones and classify.

        Reads the stored since_id for ``ID`` and forwards it with ``Q`` and
        ``geoCode`` to ``twitterInt.retrieveTweets``.  Retweets are replaced
        by their ``retweeted_status`` payload (flagged ``alreadyRetweeted``),
        every tweet gets a ``created_time`` entry, and the batch is appended
        to the list returned by ``mongoInt.retrieveParentIdTrue(ID)``.  The
        combined list goes through ``self.topicModelLSI``; unless that
        returns 0 its result is passed to ``self.updateRatio``.

        :param ID: identifier passed to the mongoInt helpers.
        :param Q: search query forwarded to twitterInt and the topic model.
        :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
        :returns: ``[]`` when nothing was fetched, otherwise the result of
            ``self.runClassifier(ID)``.
        '''
        since_id = mongoInt.retrieveSinceID(ID)
        logger.debug('retrieve tweets')
        logger.debug(since_id)
        logger.debug('retrieve tweets123456')
        twits = twitterInt.retrieveTweets(Q, geoCode, since_id)

        mongoInt.collectionFeedFrequency(len(twits), ID)

        def removeRetweets(tweet):
            '''Return the wrapped original for a retweet, else the tweet.'''
            if 'retweeted_status' in tweet:
                tweet = tweet['retweeted_status']
                tweet['alreadyRetweeted'] = True
                # Logged rather than printed: printing non-ASCII tweet text
                # can raise UnicodeEncodeError when stdout is ASCII-only.
                logger.debug('unwrapped retweet: %s', tweet.get('text'))
            return tweet

        # Build a real list so len() and the emptiness check below also work
        # where map() is lazy (Python 3).
        twits = [removeRetweets(tw) for tw in twits]

        # The "+0000" literal in the format means created_at is UTC.
        for tw in twits:
            tw['created_time'] = timegm(
                time.strptime(tw['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

        # TODO: called directly instead of through a wrapper; only the tweet
        # text and id should be passed here eventually.
        logger.info('tweets fetched for %s are %s', ID, len(twits))
        if not twits:
            return []

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
        # NOTE(review): these two counts are logged at ERROR level although
        # they look informational - confirm the level is intentional.
        logger.error('existing uniqueTweetsFromDB :%s',
                     len(uniqueTweetsFromDB))
        logger.debug('total uniqe feeds %s', uniqueTweetsFromDB)
        uniqueTweetsFromDB.extend(twits)
        logger.error('total combined tweets :%s', len(uniqueTweetsFromDB))

        similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
        if similarTweet != 0:
            self.updateRatio(ID, similarTweet, uniqueTweetsFromDB, Q)
        return self.runClassifier(ID)
示例#4
0
    def retrieveTweets(self,ID,Q,geoCode):
        '''Fetch tweets for a query and hand them to mongoInt.insertFeedData.

        :param ID: identifier passed to mongoInt.insertFeedData.
        :param Q: search query forwarded to twitterInt.retrieveTweets.
        :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
        :returns: the tweets exactly as returned by twitterInt.retrieveTweets.
        '''
        logger.debug('retrieve tweets')
        fetched = twitterInt.retrieveTweets(Q, geoCode)

        logger.debug('storing tweets of twitter of both location baseed on keyworad mongoDb')
        # The running total is never read afterwards; the addition is kept so
        # the call behaves exactly as before.
        stored = 0
        stored += mongoInt.insertFeedData(ID, fetched)

        # TODO: returning the tweets is meant to be replaced by a MongoDB lookup.
        return fetched
示例#5
0
    def retrieveTweets(self,ID,Q,geoCode):
        '''Fetch new tweets, merge them with stored ones and update ratios.

        Reads the stored since_id for ``ID`` and forwards it with ``Q`` and
        ``geoCode`` to ``twitterInt.retrieveTweets``.  Retweets are replaced
        by their ``retweeted_status`` payload (flagged ``alreadyRetweeted``),
        every tweet gets a ``created_time`` entry, and the batch is appended
        to the list returned by ``mongoInt.retrieveParentIdTrue(ID)``.  The
        combined list goes through ``self.topicModelLSI``; unless that
        returns 0 its result is passed to ``self.updateRatio``.

        :param ID: identifier passed to the mongoInt helpers.
        :param Q: search query forwarded to twitterInt and the topic model.
        :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
        :returns: size of the combined (stored + new) tweet list, or 0 when
            nothing was fetched (previously an implicit ``None``).
        '''
        since_id = mongoInt.retrieveSinceID(ID)
        logger.debug('retrieve tweets')
        logger.debug(since_id)
        logger.debug('retrieve tweets123456')
        twits = twitterInt.retrieveTweets(Q, geoCode, since_id)

        mongoInt.collectionFeedFrequency(len(twits), ID)

        def removeRetweets(tweet):
            '''Return the wrapped original for a retweet, else the tweet.'''
            if 'retweeted_status' in tweet:
                tweet = tweet['retweeted_status']
                tweet['alreadyRetweeted'] = True
                # Logged rather than printed: printing non-ASCII tweet text
                # can raise UnicodeEncodeError when stdout is ASCII-only.
                logger.debug('unwrapped retweet: %s', tweet.get('text'))
            return tweet

        # Build a real list so len() and the emptiness check below also work
        # where map() is lazy (Python 3).
        twits = [removeRetweets(tw) for tw in twits]
        logger.debug('tweets after retweet unwrapping: %s', twits)

        # The "+0000" literal in the format means created_at is UTC.
        for tw in twits:
            tw['created_time'] = timegm(
                time.strptime(tw['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))

        # TODO: called directly instead of through a wrapper; only the tweet
        # text and id should be passed here eventually.
        logger.info('tweets fetched are chellaaa %s',len(twits))
        if not twits:
            # Keep the return type an int on every path.
            return 0

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
        logger.debug('existing uniqueTweetsFromDB :%s',len(uniqueTweetsFromDB))
        uniqueTweetsFromDB.extend(twits)
        logger.debug('total combined tweets :%s',len(uniqueTweetsFromDB))

        similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
        if similarTweet != 0:
            self.updateRatio(ID, similarTweet, uniqueTweetsFromDB, Q)
        return len(uniqueTweetsFromDB)
示例#6
0
 def retrieveTweets(self,ID,Q,geoCode):
     '''Fetch tweets for a query, timestamp them and store them via mongoInt.

     Tweets come from ``twitterInt.retrieveTweets(Q, geoCode)``.  Each tweet
     gets a ``created_time`` entry: the epoch seconds of its ``created_at``
     string, stored as a decimal string.  A non-empty batch is passed to
     ``mongoInt.insertFeedData``; for an empty batch
     ``mongoInt.createCollection(ID)`` is called instead and a warning is
     logged if it reports failure.

     :param ID: identifier passed to the mongoInt helpers.
     :param Q: search query forwarded to twitterInt.retrieveTweets.
     :param geoCode: geo filter forwarded to twitterInt.retrieveTweets.
     :returns: the fetched tweets, with ``created_time`` added.
     '''
     # Local import: this block cannot rely on the module already importing it.
     from calendar import timegm

     logger.debug('retrieve tweets')
     twits = twitterInt.retrieveTweets(Q, geoCode)

     # The "+0000" literal in the format means created_at is UTC.
     # time.mktime() would read the parsed fields as *local* time and shift
     # the result by the host's UTC offset; timegm() treats them as UTC.
     for tw in twits:
         parsed = time.strptime(tw['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
         tw['created_time'] = str(timegm(parsed))

     logger.debug('storing tweets of twitter of both location based on keyword mongoDb')
     if twits:
         mongoInt.insertFeedData(ID, twits)
     elif not mongoInt.createCollection(ID):
         logger.warning('unable to create collection in mongodb')

     # TODO: returning the tweets is meant to be replaced by a MongoDB lookup.
     return twits