def watchlist(self):
    """Watch a fixed list of user_id.

    Steps, in order:
      1. Empty the ``target_users`` table and reload it from ``seed.lst``.
      2. Crawl the users listed in ``seed.lst`` via ``self.crawl``.
      3. Select the distinct ``friend_id`` values of those target users
         and hand them to ``misc.write_to_files`` under the
         ``initial_friends`` prefix.
      4. Enter the generate-crawl-update loop (``self.twalerloop``).

    A dedicated database handle is opened for steps 1-3 and is released
    even if one of the queries or the crawl raises.
    """
    db = misc.mysql_db(self.config['db_server'],
                       self.config['db_username'],
                       self.config['db_password'],
                       self.config['db_database'],
                       self.logger)
    try:
        # Clean up database first
        db.execute('DELETE FROM target_users')

        # TODO still have problem
        load_stmt = ('LOAD DATA LOCAL INFILE "seed.lst" INTO TABLE target_users '
                     'FIELDS TERMINATED BY "\\t" LINES TERMINATED BY "\\n"')
        db.execute(load_stmt)

        # Get that list first
        self.crawl('seed.lst')

        # Get that list's friend second
        friends_stmt = ('SELECT DISTINCT friend_id FROM friends, target_users '
                        'WHERE friends.user_id = target_users.user_id')
        db.execute(friends_stmt)
        results = db.cursor.fetchall()
    finally:
        # The wrapper's only teardown hook visible here is __del__; call it
        # on every exit path so a failed query or crawl does not leave the
        # handle open until garbage collection.
        db.__del__()

    misc.write_to_files(results, 'initial_friends',
                        self.config['seed_per_file'], 'utf')

    # Enter the generate-crawl-update loop
    self.twalerloop()
def generate(self):
    """Generate the next batch of seed files from friends not yet crawled.

    Selects the ``seed_limit`` most frequently occurring ``friend_id``
    values in ``friends`` that do not appear as a ``user_id`` in
    ``users_update``, then passes the rows to ``misc.write_to_files``
    under a ``seeds_<datestamp>`` prefix.

    Any exception raised along the way is caught: its traceback is printed
    to stderr and its message is logged at error level. Nothing is
    re-raised, so the method always returns ``None``.
    """
    try:
        # TODO improve our naive seed generation method
        # ---------------------------------
        # select the top _seed_limit_ most re-occurring friend that
        # has yet to be crawled
        stmt = ("SELECT friend_id FROM friends WHERE friend_id NOT IN "
                "(SELECT user_id FROM users_update) "
                "group by friend_id order by count(*) desc limit %s"
                % self.config['seed_limit'])
        self.db.execute(stmt)
        self.logger.debug("MySQL generate_users Query Complete")

        # ---------------------------------
        # fetch results and write to new seed file
        results = self.db.cursor.fetchall()
        timestamp = misc.timefunctions.datestamp()
        misc.write_to_files(results, 'seeds_' + timestamp,
                            self.config['seed_per_file'], 'utf')
    except Exception as e:
        # print_exc() reports the traceback of the exception being handled;
        # print_stack() would only show how this handler was reached.
        traceback.print_exc()
        self.logger.error(str(e))