예제 #1
0
 def fetchInterestFeeds(self,ID):
     '''Collect feeds for every interest node that neo4j returns for ``ID``.

     For each record the node name (plus city, when present) becomes the
     search query. If ``mongoInt.checkCollExists`` reports a collection for
     the node id, its documents are read back from MongoDB; otherwise the
     feeds are fetched through ``self.retrieveTweets`` and
     ``self.retrieveMediaBasedTags``.

     :param ID: identifier handed to ``neo4jInt.getInterestNode``.
     :returns: list with the feeds gathered for all interest nodes.
     '''
     recordList = neo4jInt.getInterestNode(graphDB,ID)
     tweets = []
     for record in recordList:
         node = record[0]
         # Build the geo filter from scratch for every node. Previously it was
         # only cleared after a remote fetch, so coordinates of a node served
         # from MongoDB leaked into the next node that had no 'lat'.
         geoDict = {}
         if node['lat'] is not None:
             geoDict = {
                 'lat': node['lat'],
                 'lng': node['lng'],
                 'distance': '.5',  # default radius = 500m
             }
         logger.info('recordList output of neo4j:%s',node['name'])
         if node['city'] is not None:
             query = node['name'] +' '+ node['city']
         else:
             query = node['name']
         # Keep the node id in its own local instead of overwriting the ID argument.
         nodeId = node['id']
         logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',nodeId,query,geoDict)
         # NOTE(review): '> 1' treats a collection holding exactly one document
         # as missing - confirm against checkCollExists' return value.
         if mongoInt.checkCollExists(nodeId) > 1:
             tweets.extend(mongoInt.retrieveCollection(nodeId))
         else:
             tweets.extend(self.retrieveTweets(nodeId,query,geoDict))
             tweets.extend(self.retrieveMediaBasedTags(nodeId,query,geoDict))
     return tweets
예제 #2
0
def retrieveTwitterAccessTokens(collName):
    '''Return the Twitter access tokens stored in collection ``collName``.

    When ``mongoInt.checkCollExists`` reports fewer than one entry, the
    collection is first populated with the default token taken from
    ``globalS.dictDb['SATHISH_TOKEN']``.

    :param collName: name of the MongoDB collection holding the tokens.
    :returns: whatever ``mongoInt.retrieveTwitterTokens`` yields for ``collName``.
    '''
    if mongoInt.checkCollExists(collName) < 1:
        # The token collection is not available yet: seed it with the
        # default token read from the configuration.
        defaultToken = globalS.dictDb['SATHISH_TOKEN']
        # NOTE(review): this writes credential material to the debug log -
        # confirm that is acceptable for the deployment.
        logger.debug('default twitter token is:%s',defaultToken)
        if mongoInt.insertTwitteTokens(collName,defaultToken):
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('twitter_Access_Tokens was empty added default token now')
    tokens = mongoInt.retrieveTwitterTokens(collName)
    logger.debug('tokens retrieved key secret : %s',tokens)
    return tokens
예제 #3
0
def retrieveTwitterAccessTokens(collName = 'twitter_Access_Tokens'):
    '''Return the Twitter access tokens stored in collection ``collName``.

    When ``mongoInt.checkCollExists`` reports fewer than one entry, the
    collection is first populated with the default token taken from
    ``globalS.dictDb['SATHISH_TOKEN']``.

    :param collName: name of the MongoDB collection holding the tokens;
        defaults to ``'twitter_Access_Tokens'``.
    :returns: whatever ``mongoInt.retrieveTwitterTokens`` yields for ``collName``.
    '''
    if mongoInt.checkCollExists(collName) < 1:
        # The token collection is not available yet: seed it with the
        # default token read from the configuration.
        defaultToken = globalS.dictDb['SATHISH_TOKEN']
        # NOTE(review): this writes credential material to the debug log -
        # confirm that is acceptable for the deployment.
        logger.debug('default twitter token is:%s',defaultToken)
        if mongoInt.insertTwitteTokens(collName,defaultToken):
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('twitter_Access_Tokens was empty added default token now')
    tokens = mongoInt.retrieveTwitterTokens(collName)
    logger.debug('tokens retrieved key secerte : %s',tokens)
    return tokens
예제 #4
0
    def fetchInterestFeeds(self,ID,lastTimeStamp):
        '''Collect feeds for every interest node that neo4j returns for ``ID``.

        Each record yields a search query (node name, plus city when present)
        and an optional geo filter. Nodes for which ``mongoInt.checkCollExists``
        reports a collection are read back from MongoDB; if there is no such
        node at all, the remaining ones are fetched through
        ``self.retrieveMediaBasedTags``.

        :param ID: identifier handed to ``neo4jInt.getInterestNode``.
        :param lastTimeStamp: time marker passed to
            ``mongoInt.retrieveCollection``; it is moved back by
            ``globalS.dictDb['DELTA_FEEDS_TIME']`` while fewer than 10
            documents have been collected.
        :returns: list with the gathered feeds.
        '''
        recordList = neo4jInt.getInterestNode(graphDB,ID)
        tweets = []
        jobsArgs = []
        collectionList = []

        # Split the interest nodes into "already in MongoDB" and "needs fetching".
        for record in recordList:
            node = record[0]
            geoDict = {}  # fresh geo filter per node
            if node['lat'] is not None:
                geoDict = {
                    'lat': node['lat'],
                    'lng': node['lng'],
                    'distance': '.5',  # default radius = 500m
                }
                logger.info('recordList output of neo4j:%s',node['name'])

            if node['city'] is not None:
                query = node['name'] +' '+ node['city']
            else:
                query = node['name']

            nodeId = node['id']
            logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',nodeId,query,geoDict)

            # NOTE(review): '> 1' treats a collection holding exactly one
            # document as missing - confirm against checkCollExists.
            if mongoInt.checkCollExists(nodeId) > 1:
                collectionList.append(nodeId)
            else:
                jobsArgs.append([nodeId,query,geoDict])

        if collectionList:
            # The original widened the time window through unbounded recursion,
            # which ends in a RecursionError when the collections never yield
            # 10 documents. Iterate instead and give up after a fixed number
            # of attempts, returning whatever was collected.
            # NOTE(review): the cap is an arbitrary safety value - tune as needed.
            maxLookbacks = 100
            countLimit = globalS.dictDb['MONGODB_COUNT_LIMIT']
            for _ in range(maxLookbacks):
                for collName in collectionList:
                    logger.debug('collName = %s & time = %s',collName,lastTimeStamp)
                    docLists = mongoInt.retrieveCollection(collName,lastTimeStamp,countLimit)
                    if globalS.dictDb['APP_DEBUG']:
                        # Tag each doc with the collection it was read from. The
                        # original used the id of the last record for every
                        # collection. Explicit loop: map() is lazy on Python 3
                        # and would never apply the update.
                        for twit in docLists:
                            twit.update({'queryDetails':(collName,)})
                    if len(docLists):
                        logger.info('fetched %s docs from collection:%s appending to tweets',len(docLists),collName)
                        # log the ids of the feeds so duplicate feeds can be spotted
                        for twit in docLists:
                            logger.debug('doc ID is %s',twit['id'])
                        tweets.extend(docLists)
                if len(tweets) >= 10:
                    break
                lastTimeStamp = int(lastTimeStamp)-globalS.dictDb['DELTA_FEEDS_TIME']
                logger.info('Docs are not available so recursive calling %s',lastTimeStamp)
            else:
                logger.warning('giving up after %s look-back attempts',maxLookbacks)
            logger.info('collectively returned %s docs for multiple documents %s',len(tweets),collectionList)
        elif jobsArgs:
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('Collection is empty invoking worker pools:%s',jobsArgs)
            # Only media based tags are fetched; the tweet search was disabled
            # in the original because it took too long. An explicit loop is
            # used because map() is lazy on Python 3 and would fetch nothing.
            for args in jobsArgs:
                tweets.extend(self.retrieveMediaBasedTags(*args))
            logger.debug('multiprocessing pool has returned %s feeds',len(tweets))
        return tweets
예제 #5
0
    def fetchInterestFeeds(self, ID, lastTimeStamp):
        '''Collect feeds for every interest node that neo4j returns for ``ID``.

        Each record yields a search query (node name, plus city when present)
        and an optional geo filter. Nodes for which ``mongoInt.checkCollExists``
        reports a collection are read back from MongoDB; all other nodes are
        fetched through ``self.retrieveTweets`` and
        ``self.retrieveMediaBasedTags`` (at most 20 items per call).

        :param ID: identifier handed to ``neo4jInt.getInterestNode``.
        :param lastTimeStamp: time marker passed to
            ``mongoInt.retrieveCollection``.
        :returns: list with the gathered feeds.
        '''
        recordList = neo4jInt.getInterestNode(graphDB, ID)
        tweets = []
        jobsArgs = []

        for record in recordList:
            node = record[0]
            geoDict = {}  # fresh geo filter per node
            if node['lat'] is not None:
                geoDict = {
                    'lat': node['lat'],
                    'lng': node['lng'],
                    'distance': '.5',  # default radius = 500m
                }
            logger.info('recordList output of neo4j:%s', node['name'])

            if node['city'] is not None:
                query = node['name'] + ' ' + node['city']
            else:
                query = node['name']

            nodeId = node['id']
            logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',
                         nodeId, query, geoDict)

            # NOTE(review): '> 1' treats a collection holding exactly one
            # document as missing - confirm against checkCollExists.
            if mongoInt.checkCollExists(nodeId) > 1:
                tweets.extend(
                    mongoInt.retrieveCollection(
                        nodeId, lastTimeStamp,
                        globalS.dictDb['MONGODB_COUNT_LIMIT']))
            else:
                # Defer the remote lookups until all nodes were inspected.
                jobsArgs.append([nodeId, query, geoDict])

        if jobsArgs:
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('Collection is empty invoking worker pools:%s',
                           jobsArgs)
            # Explicit loops instead of map(): map() is lazy on Python 3, so
            # the helpers were never invoked there. The order is unchanged:
            # all tweet searches first, then all media searches.
            for args in jobsArgs:
                tweets.extend(self.retrieveTweets(*args)[:20])
            for args in jobsArgs:
                tweets.extend(self.retrieveMediaBasedTags(*args)[:20])
            logger.debug('multiprocessing pool has returned %s feeds',
                         len(tweets))

        if globalS.dictDb['APP_DEBUG']:
            # NOTE(review): every feed is tagged with the query details of the
            # LAST processed node, as in the original - confirm this is wanted.
            for twit in tweets:
                twit.update({'queryDetails': (nodeId, query, geoDict)})
        return tweets
예제 #6
0
    def fetchInterestFeeds(self,ID,lastTimeStamp):
        '''Collect cached feeds for every interest node that neo4j returns for ``ID``.

        Each record yields a search query (node name, plus city when present)
        and an optional geo filter. Nodes for which ``mongoInt.checkCollExists``
        reports a collection are read back from MongoDB. When no node has a
        collection, only a warning is logged: the remote fetch is switched off
        in this version.

        :param ID: identifier handed to ``neo4jInt.getInterestNode``.
        :param lastTimeStamp: time marker passed to
            ``mongoInt.retrieveCollection``; it is moved back by
            ``globalS.dictDb['DELTA_FEEDS_TIME']`` while fewer than 2
            documents have been collected.
        :returns: list with the gathered feeds.
        '''
        recordList = neo4jInt.getInterestNode(graphDB,ID)
        tweets = []
        jobsArgs = []
        collectionList = []

        # Split the interest nodes into "already in MongoDB" and "needs fetching".
        # The original also ran a debug-tagging map() over `tweets` here, but
        # `tweets` is always empty at this point, so that call has been dropped.
        for record in recordList:
            node = record[0]
            geoDict = {}  # fresh geo filter per node
            if node['lat'] is not None:
                geoDict = {
                    'lat': node['lat'],
                    'lng': node['lng'],
                    'distance': '.5',  # default radius = 500m
                }
                logger.info('recordList output of neo4j:%s',node['name'])

            if node['city'] is not None:
                query = node['name'] +' '+ node['city']
            else:
                query = node['name']

            nodeId = node['id']
            logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',nodeId,query,geoDict)

            # NOTE(review): '> 1' treats a collection holding exactly one
            # document as missing - confirm against checkCollExists.
            if mongoInt.checkCollExists(nodeId) > 1:
                collectionList.append(nodeId)
            else:
                jobsArgs.append([nodeId,query,geoDict])

        if collectionList:
            # Fixes relative to the original recursive helper:
            #  * it was invoked without its required timestamp argument and
            #    raised TypeError on the first call;
            #  * it queried the id of the last record instead of the collection
            #    currently being iterated;
            #  * the recursion had no bound; iterate with a fixed number of
            #    attempts instead and return whatever was collected.
            # NOTE(review): the cap is an arbitrary safety value - tune as needed.
            maxLookbacks = 100
            countLimit = globalS.dictDb['MONGODB_COUNT_LIMIT']
            for _ in range(maxLookbacks):
                for collName in collectionList:
                    logger.debug('collName = %s & time = %s',collName,lastTimeStamp)
                    tweets.extend(mongoInt.retrieveCollection(collName,lastTimeStamp,countLimit))
                if len(tweets) >= 2:
                    break
                lastTimeStamp = int(lastTimeStamp)-globalS.dictDb['DELTA_FEEDS_TIME']
                logger.info('Docs are not available so recursive calling %s',lastTimeStamp)
            else:
                logger.warning('giving up after %s look-back attempts',maxLookbacks)
            logger.info('collectively returned %s docs for multiple documents',len(tweets))
        elif jobsArgs:
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('Collection is empty invoking worker pools:%s',jobsArgs)
            # The calls to self.retrieveTweets / self.retrieveMediaBasedTags
            # were commented out in the original, so nothing is fetched here;
            # the unused helper closures have been removed.
            logger.debug('multiprocessing pool has returned %s feeds',len(tweets))
        return tweets
예제 #7
0
    def fetchInterestFeeds(self,ID,lastTimeStamp):
        '''Collect feeds for every interest node that neo4j returns for ``ID``.

        Each record yields a search query (node name, plus city when present)
        and an optional geo filter. Nodes for which ``mongoInt.checkCollExists``
        reports a collection are read back from MongoDB; all other nodes are
        fetched through ``self.retrieveTweets`` and
        ``self.retrieveMediaBasedTags`` (at most 20 items per call).

        :param ID: identifier handed to ``neo4jInt.getInterestNode``.
        :param lastTimeStamp: time marker passed to
            ``mongoInt.retrieveCollection``.
        :returns: list with the gathered feeds.
        '''
        recordList = neo4jInt.getInterestNode(graphDB,ID)
        tweets = []
        jobsArgs = []

        for record in recordList:
            node = record[0]
            geoDict = {}  # fresh geo filter per node
            if node['lat'] is not None:
                geoDict = {
                    'lat': node['lat'],
                    'lng': node['lng'],
                    'distance': '.5',  # default radius = 500m
                }
            logger.info('recordList output of neo4j:%s',node['name'])

            if node['city'] is not None:
                query = node['name'] +' '+ node['city']
            else:
                query = node['name']

            nodeId = node['id']
            logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',nodeId,query,geoDict)

            # NOTE(review): '> 1' treats a collection holding exactly one
            # document as missing - confirm against checkCollExists.
            if mongoInt.checkCollExists(nodeId) > 1:
                tweets.extend(mongoInt.retrieveCollection(nodeId,lastTimeStamp,globalS.dictDb['MONGODB_COUNT_LIMIT']))
            else:
                # Defer the remote lookups until all nodes were inspected.
                jobsArgs.append([nodeId,query,geoDict])

        if jobsArgs:
            # logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning('Collection is empty invoking worker pools:%s',jobsArgs)
            # Explicit loops instead of map(): map() is lazy on Python 3, so
            # the helpers were never invoked there. The order is unchanged:
            # all tweet searches first, then all media searches.
            for args in jobsArgs:
                tweets.extend(self.retrieveTweets(*args)[:20])
            for args in jobsArgs:
                tweets.extend(self.retrieveMediaBasedTags(*args)[:20])
            logger.debug('multiprocessing pool has returned %s feeds',len(tweets))

        if globalS.dictDb['APP_DEBUG']:
            # NOTE(review): every feed is tagged with the query details of the
            # LAST processed node, as in the original - confirm this is wanted.
            for twit in tweets:
                twit.update({'queryDetails':(nodeId,query,geoDict)})
        return tweets