Пример #1
0
 def watchlist(self):
     '''Watch a fixed list of user_id.

     Steps, in order:
       1. Empty the ``target_users`` table and reload it from ``seed.lst``.
       2. Crawl the users listed in ``seed.lst`` via ``self.crawl``.
       3. Select the distinct ``friend_id`` values of those target users
          and hand them to ``misc.write_to_files`` under the name
          ``initial_friends``.
       4. Enter the generate-crawl-update loop (``self.twalerloop``).

     The database handle is released even when one of the steps raises;
     the exception itself still propagates to the caller.
     '''
     db = misc.mysql_db(self.config['db_server'],
                        self.config['db_username'],
                        self.config['db_password'],
                        self.config['db_database'], self.logger)
     try:
         # Clean up database first
         db.execute('DELETE FROM target_users')
         # TODO still have problem
         load_stmt = ('LOAD DATA LOCAL INFILE "seed.lst" INTO TABLE target_users '
                      'FIELDS TERMINATED BY \"\\t\" LINES TERMINATED BY \"\\n\"')
         db.execute(load_stmt)
         # Get that list first
         self.crawl('seed.lst')
         # Get that list's friend second
         friends_stmt = ('SELECT DISTINCT friend_id FROM friends, target_users '
                         'WHERE friends.user_id = target_users.user_id')
         db.execute(friends_stmt)
         results = db.cursor.fetchall()
     finally:
         # Release the handle on every path, and before the long-running
         # loop below. NOTE(review): __del__ is the only teardown hook
         # visible from here; switch to an explicit close() if
         # misc.mysql_db provides one.
         db.__del__()
     misc.write_to_files(results, 'initial_friends',
                         self.config['seed_per_file'], 'utf')
     # Enter the generate-crawl-update loop
     self.twalerloop()
Пример #2
0
    def generate(self):
        '''Generate a new batch of seed files from the friends table.

        Selects up to ``self.config['seed_limit']`` ``friend_id`` values
        that do not appear in ``users_update``, most frequently occurring
        first, and passes them to ``misc.write_to_files`` under a
        ``seeds_<datestamp>`` name.

        Any exception is reported (traceback on stderr, message through
        ``self.logger.error``) and then swallowed; nothing is raised to
        the caller.
        '''
        try:
            # TODO improve our naive seed generation method
            #---------------------------------
            # select the top _seed_limit_ most re-occurring friends that
            # have yet to be crawled
            stmt = ("SELECT friend_id FROM friends WHERE friend_id NOT IN "
                    "(SELECT user_id FROM users_update) "
                    "group by friend_id order by count(*) desc limit %s" %
                    self.config['seed_limit'])
            self.db.execute(stmt)
            self.logger.debug("MySQL generate_users Query Complete")
            seed_type = 'utf'

            #---------------------------------
            # fetch results and write to new seed file
            results = self.db.cursor.fetchall()
            timestamp = misc.timefunctions.datestamp()
            misc.write_to_files(results, 'seeds_' + timestamp,
                                self.config['seed_per_file'], seed_type)
        except Exception as e:
            # print_exc() reports the caught exception and where it was
            # raised; print_stack() would only show this handler's own
            # call stack and omit the actual error.
            traceback.print_exc()
            self.logger.error(str(e))