def createdocwithid():
    foo = Couch('localhost', '5984')
    print("\nCreate a document, using an assigned docId:")
    doc = """
    {
        "value": {
            "Subject": "I like Plankton",
            "Author": "Rusty",
            "PostedDate": "2006-08-15T17:30:12-04:00",
            "Tags": ["plankton", "baseball", "decisions"],
            "Body": "I decided today that I don't like baseball. I like plankton."
        }
    }
    """
    foo.saveDoc('mydb', doc)
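# Under the hood, saveDoc amounts to a REST call against CouchDB's HTTP API.
# A minimal sketch of the equivalent raw request using only the standard
# library (assumes CouchDB is listening on localhost:5984; the Couch
# wrapper's actual implementation may differ):
import http.client
import json

conn = http.client.HTTPConnection('localhost', 5984)
body = json.dumps({"Author": "Rusty"})
conn.request('POST', '/mydb/', body=body,
             headers={'Content-Type': 'application/json'})
resp = conn.getresponse()
print(resp.status, resp.read())  # 201 plus the new document's id and rev on success
conn.close()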
def test(self):
    couch = Couch()
    couch.populate()
    things = couch.count()
    self.assertEqual(things, 6)
import datetime
import re
import time
from collections import Counter
from queue import PriorityQueue

import GetOldTweets3 as got
from shapely.geometry import Point, box
from tweepy import Cursor, Stream, error
from tweepy.streaming import StreamListener

# Project-local helpers (Couch, twitter_setup, polygon_list, lang_list,
# filter_tweet, check_relevance, extract_keywords, sentiment) are imported
# from this package's own modules.


class Crawler(StreamListener):

    def __init__(self, config, logger):
        self.logger = logger
        self.couchConfig = config['couch']
        self.twt_db = Couch(self.couchConfig, 'twt_db', self.logger)
        self.user_db = Couch(self.couchConfig, 'user_db', self.logger)
        self.crawlerConfig = config['tweet_extractor']
        self.api = twitter_setup(self.crawlerConfig)
        self.api.verify_credentials()
        logger.info("Authentication OK")
        self.twitterStream = None
        self.polygon = polygon_list(self.crawlerConfig['POLYGON'])
        self.languages = lang_list(self.crawlerConfig['LANG'])
        self.searchCity = self.crawlerConfig['CITY']
        self.searchState = self.crawlerConfig['STATE']
        self.searchTerms = self.crawlerConfig['SEARCH_TERMS']
        self.query = self.searchTerms.replace(",", " OR ")
        self.searchTermsList = self.searchTerms.split(',')
        self.searchRadius = self.crawlerConfig['GEOCODE']
        self.filterKeys = ["created_at", "id", "id_str", "full_text", "coordinates",
                           "place", "lang", "city", "state", "place_name", "neighborhood"]
        user = ["id", "id_str", "created_at", "name", "screen_name", "location",
                "time_zone", "statuses_count", "followers_count", "url"]
        user_keys = ["user." + k for k in user]
        self.filterKeys.extend(user_keys)
        self.q = PriorityQueue()
        self.oldTweetSize = 500000
        self.oldTweetSearch = self.crawlerConfig['OLD_SEARCH']
        self.followerLimit = 300
        self.history = {'tweet': set(), 'user_q': set()}
        # Stats variables
        self.twtCount = 0
        self.validTwtCount = 0
        self.totCount = 0
        self.whichStats = Counter({"0": 0, "1": 0, "2": 0})
        self.currentDay = datetime.datetime.now().day

    def check_location(self, tweet):
        flag = False
        userloc = tweet.author.location if tweet.author.location else ""
        json_tweet = tweet._json
        city_regex = re.compile(
            "(perth)|(brisbane)|(melbourne)|(sydney)|(adelaide)|(gold coast)|(hobart)")
        state_regex = re.compile(
            "(victoria)|(western australia)|(queensland)|(new south wales)|"
            "(south australia)|(northern territory)|(tasmania)")
        polygon = box(self.polygon[0], self.polygon[1], self.polygon[2], self.polygon[3])
        state = None
        city = None
        neighborhood = None
        place_name = None
        if tweet.coordinates or tweet.place or tweet.author.location:
            if tweet.coordinates:
                point = tweet.coordinates['coordinates']
                p = Point(point[0], point[1])
                flag = polygon.contains(p)
                state = self.searchState
            if tweet.place and not flag:
                if tweet.place.place_type == 'poi':
                    point = list(tweet.place.bounding_box.origin())
                    place_name = tweet.place.full_name.lower()
                    place = {'type': 'Point', 'coordinates': point}
                    json_tweet['coordinates'] = place
                    p = Point(point[0], point[1])
                    flag = polygon.contains(p)
                    state = self.searchState
            if tweet.place:
                if tweet.place.place_type == 'city':
                    city = tweet.place.name.lower()
                    m = state_regex.search(tweet.place.full_name.lower())
                    if m:
                        state = m.group(0)
                        flag = True
                if tweet.place.place_type == 'neighborhood':
                    cm = city_regex.search(tweet.place.full_name.lower())
                    if cm:
                        city = cm.group(0)
                        neighborhood = tweet.place.name.lower()
                        flag = True
            if not flag and tweet.author.location:
                cm = city_regex.search(userloc.lower())
                sm = state_regex.search(userloc.lower())
                if cm:
                    city = cm.group(0)
                    flag = True
                if sm:
                    state = sm.group(0)
                    flag = True
        # Workaround because Perth is named differently in the places section.
        if city:
            city = 'perth' if 'perth' in city.lower() else city
        json_tweet['state'] = state.lower() if state else state
        json_tweet['city'] = city.lower() if city else city
        json_tweet['place_name'] = place_name.lower() if place_name else place_name
        json_tweet['neighborhood'] = neighborhood.lower() if neighborhood else neighborhood
        return flag, json_tweet

    def add_user_to_queue(self, user, flag, pipe):
        if int(user['id_str']) not in self.history['user_q']:
            self.history['user_q'].add(int(user['id_str']))
            self.q.put((flag, user['id_str']))
            self.logger.info(f"Pipe: {pipe} | Added User ID: {user['id_str']} to Queue")

    def on_connect(self):
        """Alert when the connection is established."""
        self.logger.info("Pipe: Stream | Connected to Twitter Streaming...")

    def on_status(self, status):
        """Filter incoming tweets."""
        try:
            if self.tweet_processor(status, 0, "Stream"):
                self.logger.debug(
                    f"Pipe: Stream | Added Tweet ID: {status.id} by User ID: {status.author.id}")
        except Exception as e:
            self.logger.exception(e)
            self.logger.info(
                f"Pipe: Stream | Error processing tweet ID: {status.id}... Skipping....")

    def on_error(self, status_code):
        """Currently stopping only for the rate-limit error."""
        if status_code == 420:
            self.logger.error("Exceeded request rate and being limited.")
            return False

    def on_exception(self, ex):
        """Handle exceptions for logging purposes."""
        self.logger.exception(ex)

    def disconnect(self):
        """Disconnect the Twitter stream."""
        if self.twitterStream is not None:
            self.twitterStream.disconnect()
            self.twitterStream = None

    def tweet_processor(self, tweet, flag, pipe):
        self.totCount += 1
        self.logger.debug(
            f"Pipe: {pipe} | Processing Tweet ID: {tweet.id} | User ID: {tweet.author.id}")
        # Consider the original tweet of a retweet.
        if hasattr(tweet, 'retweeted_status'):
            tweet = tweet.retweeted_status
        if tweet.id_str not in self.twt_db.db and tweet.id not in self.history['tweet']:
            self.history['tweet'].add(tweet.id)
            valid, json_tweet = self.check_location(tweet)
            json_tweet = filter_tweet(json_tweet, self.filterKeys)
            if valid:
                if check_relevance(json_tweet['full_text'], self.searchTermsList):
                    # Add the relevant tweet to the database with relevance tags.
                    json_tweet['keywords'], json_tweet['hashtags'] = extract_keywords(
                        json_tweet['full_text'])
                    json_tweet['sentiment'] = sentiment(json_tweet['full_text'])
                    json_tweet['relevance'] = True
                    if self.twt_db.save(json_tweet):
                        self.logger.info(
                            f'Pipe: {pipe} | Saving Tweet ID: {json_tweet["id"]} | Database: twt_db')
                        self.twtCount += 1
                        self.validTwtCount += 1
                else:
                    json_tweet['relevance'] = False
                    # Add the tweet to the database with normal tags.
                    if self.twt_db.save(json_tweet):
                        self.logger.info(
                            f'Pipe: {pipe} | Saving Tweet ID: {json_tweet["id"]} | Database: twt_db')
                        self.twtCount += 1
                self.add_user_to_queue(json_tweet['user'], flag, pipe)
                self.logger.info(
                    f'Pipe: {pipe} | Count: Valid - {self.validTwtCount} | '
                    f'{self.searchState} - {self.twtCount} | Total - {self.totCount}')
                return True
            else:
                return False
        else:
            self.logger.debug(
                f"Pipe: {pipe} | Skipping Tweet ID: {tweet.id} as already processed.")
            return False

    def download_stream(self):
        self.logger.info("Pipe: Stream | Initializing Twitter Streaming pipeline......")
        self.twitterStream = Stream(self.api.auth, self, tweet_mode='extended')
        self.twitterStream.filter(locations=self.polygon,
                                  track=self.searchTermsList, is_async=True)

    def download_search(self):
        query = self.query
        geocode = self.searchRadius
        twtPerQuery = 100
        current = self.validTwtCount
        self.logger.info("Pipe: Search | Initializing Twitter Search pipeline...")
        for tweet in Cursor(self.api.search,
                            q=query,
                            count=twtPerQuery,
                            geocode=geocode,
                            tweet_mode='extended',
                            exclude_replies=True).items():
            try:
                if self.tweet_processor(tweet, 1, "Search"):
                    self.logger.debug(
                        f"Pipe: Search | Added Tweet ID: {tweet.id} by User ID: {tweet.author.id}")
            except Exception as e:
                self.logger.exception(e)
                self.logger.info(
                    f"Pipe: Search | Error processing tweet ID: {tweet.id} ... Skipping....")
        count = self.validTwtCount - current
        self.logger.info(
            f"Pipe: Search | Closing Twitter Search pipeline with {count} valid tweets......")

    def download_user(self):
        self.logger.info("Pipe: User | Initializing Twitter User Timeline pipeline......")
        while True:
            slot, user_id = self.q.get(block=True, timeout=None)
            user_id = int(user_id)
            relevant = False
            item_count = 0
            if not self.user_db.db.get(str(user_id)):
                data = {'id_str': str(user_id)}
                if self.user_db.save(data):
                    self.logger.info(
                        f"Pipe: User | Added user {user_id} to db | Left: {self.q.qsize()} Users")
                for tweet in Cursor(self.api.user_timeline,
                                    user_id=user_id,
                                    count=200,
                                    tweet_mode='extended',
                                    exclude_replies=True).items():
                    item_count += 1
                    try:
                        if self.tweet_processor(tweet, 1, "User"):
                            self.whichStats[str(slot)] += 1
                            self.logger.debug(
                                f"Pipe: User | Added Tweet ID: {tweet.id} by User ID: {tweet.author.id}")
                            relevant = True
                    except Exception as e:
                        self.logger.exception(e)
                        self.logger.info(
                            f"Pipe: User | Error processing tweet ID: {tweet.id} ... Skipping....")
                    if item_count == self.followerLimit and not relevant:
                        self.logger.info(
                            f"Pipe: User | User ID: {user_id} No relevant tweets found.")
                        break
                if relevant:
                    for follower in Cursor(self.api.followers_ids,
                                           user_id=user_id).items(self.followerLimit):
                        self.add_user_to_queue({'id_str': str(follower)}, 2, "User")
            else:
                self.logger.debug(
                    f"Pipe: User | Already Processed User ID: {user_id} ... Skipping....")
            now = datetime.datetime.now().day
            if now != self.currentDay:
                self.currentDay = now
                self.logger.info("Pipe: User | Resetting pipeline as day shifted.")
                break

    def download_old_tweets(self):
        self.logger.info("Pipe: Old | Initializing Twitter Old Tweets pipeline......")
        query = self.searchTermsList
        for term in query:
            self.logger.info(f"Pipe: Old | Searching term: {term}.....")
            try:
                tweetCriteria = got.manager.TweetCriteria().setQuerySearch(term)\
                    .setSince("2020-01-01")\
                    .setUntil("2020-05-01")\
                    .setNear(self.oldTweetSearch)\
                    .setWithin('50km')\
                    .setMaxTweets(self.oldTweetSize)
                tweets = got.manager.TweetManager.getTweets(tweetCriteria)
                for tweet in tweets:
                    self.add_user_to_queue({'id_str': str(tweet.author_id)}, 1, "Old")
            except SystemExit:
                print('Pipe: Old | Reached limit, exiting...')
                break

    def download_tweet_list(self, file):
        self.logger.info("Pipe: Tweet IDs | Initializing Tweet downloading pipeline......")
        id_list = []
        with open(file, 'r') as f:
            for line in f:
                line = line.replace("\n", "")
                id_list.append(int(line))
        count = len(id_list)
        valid = 0
        id_chunks = [id_list[i:i + 100] for i in range(0, len(id_list), 100)]
        for chunk in id_chunks:
            try:
                tweets = self.api.statuses_lookup(chunk)
            except Exception as e:
                self.logger.exception(e)
                continue  # Skip the chunk if the lookup itself failed.
            for tweet in tweets:
                try:
                    if self.tweet_processor(tweet, 1, "Tweet IDs"):
                        valid += 1
                except Exception as e:
                    self.logger.exception(e)
                    self.logger.info(
                        f"Pipe: Tweet IDs | Error processing tweet ID: {tweet.id} ... Skipping....")
        self.logger.info(f"Pipe: Tweet IDs | Finishing with: {valid} tweets out of {count}.")

    def start_pipeline(self):
        try:
            while True:
                try:
                    self.download_stream()
                    self.download_search()
                    self.download_old_tweets()
                    # self.download_tweet_list('../../test/twtid.txt')
                    self.download_user()
                except error.TweepError as e:
                    self.logger.exception(e)
                    self.disconnect()
                    time.sleep(50)
                except Exception as e:
                    self.logger.exception(e)
                    self.disconnect()
                    time.sleep(50)
                    break
        except KeyboardInterrupt:
            self.logger.info(f"Stats {self.whichStats}")
            self.disconnect()
            self.logger.info('Stopping Crawler......')
def test_couch(self):
    couch = Couch()
    couch.populate()
    things = couch.count()
    self.assertEqual(things, 5)
def update():
    foo = Couch('localhost', '5984')
    print("\nUpdate document 'mydoc' in database 'mydb':")
    foo.update('mydb', doc, 'mydoc')


def deletedoc():
    foo = Couch('localhost', '5984')
    print("\nDelete document 'mydoc' in database 'mydb':")
    foo.deleteDoc('mydb', 'mydoc')


def deletedb():
    foo = Couch('localhost', '5984')
    print("\nDelete database 'mydb':")
    foo.deleteDb('mydb')


def dbinfo():
    foo = Couch('localhost', '5984')
    print("\nList info about database 'mydb':")
    foo.infoDb('mydb')


def retrievedoc():
    foo = Couch('localhost', '5984')
    print("\nRetrieve document 'mydoc' in database 'mydb':")
    foo.openDoc('mydb', 'mydoc')


def getDoc():
    foo = Couch('localhost', '5984')
    print("\nGet document in database 'mydb' by id:")
    foo.getDoc('mydb', 'mydoc')
    print("\nGet documents in database 'mydb' by key and value:")
    foo.getDoc('mydb', "Author", "Rusty")


def listalldoc():
    foo = Couch('localhost', '5984')
    print("\nList all documents in database 'mydb':")
    foo.listDoc('mydb')


def listDB():
    foo = Couch('localhost', '5984')
    print("\nList databases on server:")
    foo.listDb()


def createdb():
    foo = Couch('localhost', '5984')
    print("\nCreate database 'mydb':")
    foo.createDb('mydb')
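# A possible driver for the CouchDB examples above, run in a sensible order.
# This is hypothetical glue code: it only calls functions already defined in
# this file, and assumes a CouchDB server is reachable on localhost:5984.
if __name__ == '__main__':
    createdb()           # create 'mydb'
    createdocwithid()    # insert the sample document
    listDB()             # show the server's databases
    dbinfo()             # show stats for 'mydb'
    listalldoc()         # list every document in 'mydb'
    deletedb()           # drop 'mydb' again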
        # Tail of a Command.run-style method: join the worker thread, kill the
        # process on timeout, and persist the run log to CouchDB.
        thread.join(self.timeout)
        if thread.is_alive():
            print("terminating process")
            self.process.terminate()
            thread.join()
        # A timed-out run sees return code -15 (SIGTERM); a normal run sees 0.
        log.returnCode = self.process.returncode
        jsonString = json.dumps(log, default=lambda o: o.__dict__)
        print(jsonString)
        db.saveDoc('traceroute', jsonString)


f = open('urllist.txt', 'r')
db = Couch('localhost', '5984')
db.createDb('traceroute')
logFile = open(str(strftime("%Y-%m-%d %H:%M:%S", gmtime())), 'w')
for line in f:
    if line[0] == '#' or line[0] == '\n' or len(line) == 0:
        continue
    url = urlparse(line.partition(" ")[0])
    print("url :", url.netloc)
    print("comments:", line[line.index(' ') + 1:])
    command = Command(['traceroute', '-a', '-q 5', url.netloc], 20)
    command.run()
logFile.close()
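# The default=lambda o: o.__dict__ argument above lets json.dumps serialize a
# plain object without writing a custom JSONEncoder: any value json can't
# handle natively is replaced by its attribute dict. A small sketch (the Log
# class here is hypothetical, standing in for the script's log object):
import json

class Log:
    def __init__(self):
        self.host = 'example.com'
        self.returnCode = 0

print(json.dumps(Log(), default=lambda o: o.__dict__))
# -> {"host": "example.com", "returnCode": 0}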
import tweepy

from couch import Couch

config = Config()
bot = TwitBot(user=config.user,
              consumer_key=config.consumer_key,
              consumer_secret=config.consumer_secret,
              access_token_key=config.access_token_key,
              access_token_secret=config.access_token_secret)

auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token_key, config.access_token_secret)
# Don't forget to use authentication for private lists/users.
api = tweepy.API(auth)

couch = Couch(db="diabetes")
users = []

# listnames = ['ClubDiabetes', 'ClubDiabetes1', 'ClubDiabetes2', 'ClubDiabetes3']
listnames = ['ClubDiabetes']
for listname in listnames:
    # Iterate through all members of the owner's list.
    for member in tweepy.Cursor(api.list_members, 'DiabeticFury', listname).items():
        users.append(member.id)
        print(member.screen_name)
        # tweets = bot.Request(bot.Timeline, bot, member.screen_name, count=100, since=self.couch.MaxTweetId())
        tweets = bot.History(member.screen_name)
        print(len(tweets))
        if len(tweets) > 0:
#!/usr/local/bin/python
import sys
import os
import os.path

dirname = os.path.dirname(os.path.abspath(__file__))
os.chdir(dirname)
sys.path.append('../')

from couch import Couch

if __name__ == "__main__":
    db = sys.argv[1] if len(sys.argv) > 1 else ""
    couch = Couch(db=db)
    print("\nIndexing...")
    print(couch.Index('user.screen_name'))
    print(couch.Index('lang'))
    print(couch.IndexDate('created_at'))
    print("")
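# Example invocation, assuming this script is saved as index.py and a CouchDB
# database named 'diabetes' already exists:
#
#     python index.py diabetes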