def recCursor(lastTimeStamp, counter=0):
    ''' Accumulate docs from every collection in ``collectionList`` into ``tweets``.

    Not an effective method to query across multiple connections: each
    collection is queried in turn for every time window that is tried.

    ``collectionList``, ``tweets``, ``mongoInt``, ``globalS`` and ``logger``
    come from the enclosing scope. While fewer than 10 docs have been
    collected, ``lastTimeStamp`` is moved back by
    ``globalS.dictDb['DELTA_FEEDS_TIME']`` and all collections are queried
    again, up to 10000 retries.

    The retry is a loop rather than a recursive call: 10000 nested calls
    would exceed the interpreter's default recursion limit long before the
    counter guard could stop it.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    counter       -- number of retries already performed (kept for
                     backward compatibility with recursive callers)

    Always returns 1; the docs are delivered through ``tweets``.
    '''
    while True:
        for collName in collectionList:
            logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
            docLists = mongoInt.retrieveCollection(
                collName, lastTimeStamp,
                globalS.dictDb['MONGODB_COUNT_LIMIT'])

            if globalS.dictDb['APP_DEBUG']:
                # Explicit loop: map() is lazy on Python 3, so using it for
                # side effects would silently tag nothing.
                for twit in docLists:
                    logger.debug('Query details %s', collName)
                    twit.update({'queryDetails123': collName})

            if len(docLists):
                logger.info(
                    'fetched %s docs from collection:%s appending to tweets',
                    len(docLists), collName)
                # print the IDs of feeds so that any duplicate feeds can be
                # spotted in the log
                for twit in docLists:
                    logger.debug('doc ID is %s', twit['id'])
                tweets.extend(docLists)

        if len(tweets) >= 10 or counter >= 10000:
            break

        # Not enough docs yet: widen the search to an earlier window.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info(
            'Docs are not available so recursive calling %s', lastTimeStamp)
        counter += 1

    logger.info(
        'collectively returned %s docs for multiple documents %s',
        len(tweets), collectionList)
    return 1
def fetchInterestFeeds(self,ID):
    '''Collect feeds for all neo4j interest nodes returned for ``ID``.

    Each record's first element supplies ``name``, ``city``, ``lat``,
    ``lng`` and ``id``. The search text is the name, followed by the city
    when one is set. When ``mongoInt.checkCollExists`` reports more than 1
    for the node id, the docs are read from mongoDb; otherwise the results
    of ``retrieveTweets`` and ``retrieveMediaBasedTags`` are used.

    Returns the combined list of docs.
    '''
    tweets = []
    for record in neo4jInt.getInterestNode(graphDB, ID):
        node = record[0]

        # geo filter is rebuilt for each record; empty when no latitude
        geo = {}
        if node['lat'] is not None:
            # default radius = 500m
            geo = {'lat': node['lat'], 'lng': node['lng'], 'distance': '.5'}

        logger.info('recordList output of neo4j:%s', node['name'])
        if node['city'] is None:
            query = node['name']
        else:
            query = node['name'] + ' ' + node['city']
        nodeId = node['id']
        logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',
                     nodeId, query, geo)

        if mongoInt.checkCollExists(nodeId) > 1:
            tweets.extend(mongoInt.retrieveCollection(nodeId))
            continue

        tweets.extend(self.retrieveTweets(nodeId, query, geo))
        tweets.extend(self.retrieveMediaBasedTags(nodeId, query, geo))
    return tweets
def recCursor(lastTimeStamp, maxLookbacks=10000):
    '''Fetch docs for ``ID``, stepping the timestamp back until at least two
    docs are returned.

    ``ID``, ``mongoInt``, ``globalS`` and ``logger`` come from the enclosing
    scope. Each retry moves ``lastTimeStamp`` back by
    ``globalS.dictDb['DELTA_FEEDS_TIME']``.

    The retry is a bounded loop rather than unbounded recursion, so a
    collection that never yields two docs can no longer exhaust the
    interpreter stack.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    maxLookbacks  -- maximum number of earlier windows to try

    Returns the doc list of the last query. It may hold fewer than two docs
    when the lookback cap is reached.
    '''
    lookbacks = 0
    while True:
        logger.debug('collName = %s & time = %s', ID, lastTimeStamp)
        docList = mongoInt.retrieveCollection(
            ID, lastTimeStamp, globalS.dictDb['MONGODB_COUNT_LIMIT'])
        if len(docList) >= 2 or lookbacks >= maxLookbacks:
            return docList

        # Too few docs in this window: look further back in time.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available so recursive calling %s',
                    lastTimeStamp)
        lookbacks += 1
def recCursor(lastTimeStamp, maxLookbacks=10000):
    '''Accumulate docs from every collection in ``collectionList`` into
    ``tweets``.

    ``collectionList``, ``tweets``, ``mongoInt``, ``globalS`` and ``logger``
    come from the enclosing scope. While fewer than two docs have been
    collected, ``lastTimeStamp`` is moved back by
    ``globalS.dictDb['DELTA_FEEDS_TIME']`` and every collection is queried
    again.

    Fixes over the previous revision:
      * each iteration queries ``collName`` instead of the outer ``ID``,
        which fetched the same collection once per list entry;
      * the retry no longer calls ``recCursor()`` without its required
        argument (a TypeError on the first retry);
      * the retry is a bounded loop, so an empty data set cannot exhaust
        the interpreter stack.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    maxLookbacks  -- maximum number of earlier windows to try

    Returns None; the docs are delivered through ``tweets``.
    '''
    lookbacks = 0
    while True:
        for collName in collectionList:
            logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
            tweets.extend(
                mongoInt.retrieveCollection(
                    collName, lastTimeStamp,
                    globalS.dictDb['MONGODB_COUNT_LIMIT']))

        if len(tweets) >= 2 or lookbacks >= maxLookbacks:
            break

        # Too few docs so far: look further back in time.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available so recursive calling %s',
                    lastTimeStamp)
        lookbacks += 1

    logger.info('collectively returned %s docs for multiple documents',
                len(tweets))
    return
def retrieveCollection(self, ID, lastTimeStamp, count):
    ''' Retrieve docs through ``mongoInt.retrieveCollection`` for displayFeeds
    debugging.

    When ``globalS.dictDb['APP_DEBUG']`` is truthy, every returned doc gets a
    ``queryDetails`` field holding ``ID`` so its origin shows up in the feed.

    ID            -- passed to ``mongoInt.retrieveCollection`` and stored in
                     ``queryDetails``
    lastTimeStamp -- passed through to ``mongoInt.retrieveCollection``
    count         -- passed through to ``mongoInt.retrieveCollection``

    Returns a new list with the retrieved docs.
    '''
    tweets = list(mongoInt.retrieveCollection(ID, lastTimeStamp, count))

    if globalS.dictDb['APP_DEBUG']:
        logger.debug('APP_DEBUG is true so seeting the queryDetails:ID field')
        # Explicit loop: map() is lazy on Python 3, so using it for side
        # effects would silently tag nothing.
        for twit in tweets:
            twit.update({'queryDetails': ID})

    return tweets
def recCursor(lastTimeStamp, maxLookbacks=10000):
    '''Query the collection for ``ID``, widening the time window backwards
    until at least two docs come back.

    ``ID``, ``mongoInt``, ``globalS`` and ``logger`` are taken from the
    enclosing scope. Every retry subtracts
    ``globalS.dictDb['DELTA_FEEDS_TIME']`` from ``lastTimeStamp``.

    Implemented as a bounded loop instead of open-ended recursion, so a
    collection without enough docs cannot run into the interpreter's
    recursion limit.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    maxLookbacks  -- maximum number of earlier windows to try

    Returns the doc list of the final query, which may contain fewer than
    two docs if the cap was hit.
    '''
    attempts = 0
    docList = mongoInt_result = None  # placeholder until the first query runs
    while True:
        logger.debug('collName = %s & time = %s', ID, lastTimeStamp)
        docList = mongoInt.retrieveCollection(
            ID, lastTimeStamp, globalS.dictDb['MONGODB_COUNT_LIMIT'])

        enoughDocs = len(docList) >= 2
        if enoughDocs or attempts >= maxLookbacks:
            break

        # Step back one window and try again.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info(
            'Docs are not available so recursive calling %s', lastTimeStamp)
        attempts += 1
    return docList
def recCursor(lastTimeStamp, maxLookbacks=10000):
    '''Gather docs from each collection named in ``collectionList`` into
    ``tweets``.

    ``collectionList``, ``tweets``, ``mongoInt``, ``globalS`` and ``logger``
    are taken from the enclosing scope. As long as fewer than two docs have
    been gathered, ``lastTimeStamp`` is reduced by
    ``globalS.dictDb['DELTA_FEEDS_TIME']`` and all collections are queried
    again.

    Corrections relative to the earlier code:
      * the query uses the loop's ``collName``; it used the outer ``ID``,
        so one collection was read repeatedly;
      * the retry passes the timestamp along; ``recCursor()`` was called
        with no argument and raised TypeError;
      * retries run in a bounded loop instead of unbounded recursion.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    maxLookbacks  -- maximum number of earlier windows to try

    Returns None; results are appended to ``tweets``.
    '''
    attempts = 0
    while True:
        for collName in collectionList:
            logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
            docs = mongoInt.retrieveCollection(
                collName, lastTimeStamp,
                globalS.dictDb['MONGODB_COUNT_LIMIT'])
            tweets.extend(docs)

        if len(tweets) >= 2 or attempts >= maxLookbacks:
            break

        # Step back one window and query every collection again.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info(
            'Docs are not available so recursive calling %s', lastTimeStamp)
        attempts += 1

    logger.info(
        'collectively returned %s docs for multiple documents', len(tweets))
    return
def recCursor(lastTimeStamp, maxLookbacks=10000):
    ''' Accumulate docs from every collection in ``collectionList`` into ``tweets``.

    Not an effective method to query across multiple connections: each
    collection is queried in turn for every time window that is tried.

    ``collectionList``, ``tweets``, ``ID``, ``mongoInt``, ``globalS`` and
    ``logger`` come from the enclosing scope. While fewer than 10 docs have
    been collected, ``lastTimeStamp`` is moved back by
    ``globalS.dictDb['DELTA_FEEDS_TIME']`` and all collections are queried
    again.

    The retry is a bounded loop rather than unbounded recursion, so a data
    set that never reaches 10 docs cannot exhaust the interpreter stack.

    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``
    maxLookbacks  -- maximum number of earlier windows to try

    Always returns 1; the docs are delivered through ``tweets``.
    '''
    lookbacks = 0
    while True:
        for collName in collectionList:
            logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
            docLists = mongoInt.retrieveCollection(
                collName, lastTimeStamp,
                globalS.dictDb['MONGODB_COUNT_LIMIT'])

            if globalS.dictDb['APP_DEBUG']:
                # Explicit loop: map() is lazy on Python 3, so using it for
                # side effects would silently tag nothing.
                # queryDetails stays a 1-tuple, as the varargs helper
                # produced before.
                for twit in docLists:
                    twit.update({'queryDetails': (ID,)})

            if len(docLists):
                logger.info(
                    'fetched %s docs from collection:%s appending to tweets',
                    len(docLists), collName)
                # print the IDs of feeds so that any duplicate feeds can be
                # spotted in the log
                for twit in docLists:
                    logger.debug('doc ID is %s', twit['id'])
                tweets.extend(docLists)

        if len(tweets) >= 10 or lookbacks >= maxLookbacks:
            break

        # Not enough docs yet: widen the search to an earlier window.
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available so recursive calling %s',
                    lastTimeStamp)
        lookbacks += 1

    logger.info('collectively returned %s docs for multiple documents %s',
                len(tweets), collectionList)
    return 1
def fetchInterestFeeds(self, ID, lastTimeStamp):
    '''Fetch feeds for all neo4j interest nodes returned for ``ID``.

    Each record's first element supplies ``name``, ``city``, ``lat``,
    ``lng`` and ``id``. The search text is the name, followed by the city
    when one is set. For a node whose ``mongoInt.checkCollExists`` result is
    greater than 1 the docs are read from mongoDb (limited by
    ``MONGODB_COUNT_LIMIT``). Every other node is queued and afterwards
    fetched through ``retrieveTweets`` and ``retrieveMediaBasedTags``,
    keeping at most 20 results per call.

    When ``globalS.dictDb['APP_DEBUG']`` is truthy every doc is tagged with
    a ``queryDetails`` tuple ``(id, query, geo)``.
    NOTE(review): the tuple describes the LAST record processed, not the
    record each doc came from -- kept as before, confirm this is intended.

    ID            -- identifier handed to ``neo4jInt.getInterestNode``
    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``

    Returns the combined list of docs.
    '''
    recordList = neo4jInt.getInterestNode(graphDB, ID)
    tweets = []
    jobsArgs = []
    queryDetails = None

    # parse the recordList and frame the hash tags here
    for record in recordList:
        node = record[0]

        # fresh geo dictionary per record; empty when no latitude is set
        geoDict = {}
        if node['lat'] is not None:
            # default radius = 500m
            geoDict = {'lat': node['lat'], 'lng': node['lng'],
                       'distance': '.5'}

        logger.info('recordList output of neo4j:%s', node['name'])
        if node['city'] is not None:
            Q = node['name'] + ' ' + node['city']
        else:
            Q = node['name']
        nodeId = node['id']
        queryDetails = (nodeId, Q, geoDict)
        logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',
                     nodeId, Q, geoDict)

        if mongoInt.checkCollExists(nodeId) > 1:
            tweets.extend(
                mongoInt.retrieveCollection(
                    nodeId, lastTimeStamp,
                    globalS.dictDb['MONGODB_COUNT_LIMIT']))
        else:
            jobsArgs.append([nodeId, Q, geoDict])

    if len(jobsArgs):
        logger.warning('Collection is empty invoking worker pools:%s',
                       jobsArgs)
        # Run the jobs sequentially with explicit loops: map() is lazy on
        # Python 3, so using it for side effects would fetch nothing.
        # Order is unchanged: all tweet searches first, then media searches.
        for args in jobsArgs:
            tweets.extend(self.retrieveTweets(*args)[:20])
        for args in jobsArgs:
            tweets.extend(self.retrieveMediaBasedTags(*args)[:20])
        logger.debug('multiprocessing pool has returned %s feeds',
                     len(tweets))

    if globalS.dictDb['APP_DEBUG']:
        # tweets can only be non-empty after at least one record was
        # processed, so queryDetails is set whenever this loop runs
        for twit in tweets:
            twit.update({'queryDetails': queryDetails})

    return tweets
def fetchInterestFeeds(self, ID, lastTimeStamp):
    '''Fetch feeds for all neo4j interest nodes returned for ``ID``.

    The first element of each record provides ``name``, ``city``, ``lat``,
    ``lng`` and ``id``. The search text is the name plus the city when a
    city is set. Nodes for which ``mongoInt.checkCollExists`` returns more
    than 1 are served from mongoDb (limited by ``MONGODB_COUNT_LIMIT``);
    the remaining nodes are queued and then fetched via ``retrieveTweets``
    and ``retrieveMediaBasedTags``, keeping at most 20 results per call.

    With ``globalS.dictDb['APP_DEBUG']`` truthy, each doc receives a
    ``queryDetails`` tuple ``(id, query, geo)``.
    NOTE(review): that tuple comes from the LAST record processed rather
    than the doc's own record -- preserved as-is, confirm this is intended.

    ID            -- identifier handed to ``neo4jInt.getInterestNode``
    lastTimeStamp -- timestamp handed to ``mongoInt.retrieveCollection``

    Returns the combined list of docs.
    '''
    recordList = neo4jInt.getInterestNode(graphDB, ID)
    tweets = []
    jobsArgs = []
    lastQueryDetails = None

    # parse the recordList and frame the hash tags here
    for record in recordList:
        node = record[0]

        # geo dictionary starts empty for every record
        geoDict = {}
        if node['lat'] is not None:
            geoDict = {
                'lat': node['lat'],
                'lng': node['lng'],
                'distance': '.5',  # default radius = 500m
            }

        logger.info('recordList output of neo4j:%s', node['name'])
        Q = node['name']
        if node['city'] is not None:
            Q = node['name'] + ' ' + node['city']
        nodeId = node['id']
        lastQueryDetails = (nodeId, Q, geoDict)
        logger.debug('fetchInterestFeeds ID:%s Q=%s geo cordinates =%s',
                     nodeId, Q, geoDict)

        if mongoInt.checkCollExists(nodeId) > 1:
            storedDocs = mongoInt.retrieveCollection(
                nodeId, lastTimeStamp, globalS.dictDb['MONGODB_COUNT_LIMIT'])
            tweets.extend(storedDocs)
        else:
            jobsArgs.append([nodeId, Q, geoDict])

    if len(jobsArgs):
        logger.warning('Collection is empty invoking worker pools:%s',
                       jobsArgs)
        # Sequential execution with plain loops: map() returns a lazy
        # iterator on Python 3, so map-for-side-effects would fetch nothing.
        # All tweet searches run before the media searches, as before.
        for args in jobsArgs:
            tweets.extend(self.retrieveTweets(*args)[:20])
        for args in jobsArgs:
            tweets.extend(self.retrieveMediaBasedTags(*args)[:20])
        logger.debug('multiprocessing pool has returned %s feeds',
                     len(tweets))

    if globalS.dictDb['APP_DEBUG']:
        # a non-empty tweets list implies at least one record was seen,
        # so lastQueryDetails is populated whenever this loop body runs
        for twit in tweets:
            twit.update({'queryDetails': lastQueryDetails})

    return tweets