def get_tweet_from_user(self, userlist, field='id'):
    """Return the tweets made by the users in ``userlist``.

    Loads the collection from ``self.sourcefile`` and delegates the lookup
    to ``DupInfo.get_tweet``.

    :param userlist: users whose tweets are requested.
    :param field: forwarded unchanged to ``DupInfo.get_tweet``; presumably
        names the user attribute matched against ``userlist`` -- TODO
        confirm against dupefilter.
    :return: the result of ``DupInfo.get_tweet`` (a list of tweets,
        according to the original comment on this method).
    """
    dup_info = dupefilter.DupInfo(prefix=self.prefix)
    tweets_source = util.loadjson(self.sourcefile)
    return dup_info.get_tweet(
        collection=tweets_source, userlist=userlist, field=field)
def print_metadata(self):
    """Print metadata of one collection.

    Writes three lines to standard output, in this order: the
    ``num_tweet`` entry, ``end_time - start_time``, and ``start_time``,
    all read from ``DupInfo.get_metadata()``.
    """
    metadata = dupefilter.DupInfo(prefix=self.prefix).get_metadata()
    # A single parenthesised argument prints exactly like the bare
    # print statement, so the output is unchanged.
    print(metadata['num_tweet'])
    print(metadata['end_time'] - metadata['start_time'])
    print(metadata['start_time'])
def get_suspicious_user_group(self, startover=False,
                              filter_function=all_groups):
    """Run duplicate detection and return the suspicious user groups.

    First feeds the collection loaded from ``self.sourcefile`` to
    ``DupInfo.find_duplicate_tweet``, then asks the same ``DupInfo``
    object for its suspicious user groups.

    :param startover: forwarded to the ``DupInfo`` constructor.
    :param filter_function: forwarded to
        ``DupInfo.get_suspicious_user_group``; defaults to ``all_groups``.
    :return: the result of ``DupInfo.get_suspicious_user_group``.
    """
    finder = dupefilter.DupInfo(prefix=self.prefix, startover=startover)
    tweets_source = util.loadjson(self.sourcefile)
    finder.find_duplicate_tweet(
        collection=tweets_source,
        collect_url_only=self.collect_url_only)
    return finder.get_suspicious_user_group(
        filter_function=filter_function, url_based=self.url_based)
def get_percent_of_spam(self):
    """Print spam-account and spam-tweet statistics for the collection.

    Despite its name, this method returns nothing; it only prints. The
    output lines are the start time, the end time, the total and spam
    account counts with their ratio, then the total and spam tweet counts
    with their ratio.

    The "Percent" lines show the plain ratio ``spam / total`` (a fraction,
    not multiplied by 100). When the total is 0 the ratio is reported as
    0.0 instead of raising ``ZeroDivisionError``.
    """
    spam_user = self.get_spam_user_info(variable='spam_user')
    dup_info = dupefilter.DupInfo(prefix=self.prefix)
    total_user = dup_info.get_metadata(variable='num_user')

    # '%s %s' reproduces the single space the print statement puts between
    # two comma-separated items, so the output text is unchanged.
    print('%s %s' % ('Start time is ',
                     dup_info.get_metadata(variable='start_time')))
    print('%s %s' % ('End time is ',
                     dup_info.get_metadata(variable='end_time')))

    num_spam_user = len(spam_user)
    print('Total number of account is %d' % (total_user))
    print('Total number of spam account is %d' % (num_spam_user))
    # Guard against an empty collection (zero accounts).
    if total_user:
        spam_user_ratio = float(num_spam_user) / float(total_user)
    else:
        spam_user_ratio = 0.0
    print('Percent of spam account is %f' % (spam_user_ratio))

    total_tweet = dup_info.get_metadata(variable='num_tweet')
    print('Total number of tweets is %d' % (total_tweet))
    num_spam_tweet = dup_info.get_tweet(
        collection=util.loadjson(self.sourcefile),
        userlist=spam_user, only_number=True)
    print('Total number of spam tweets is %d' % (num_spam_tweet))
    # Guard against an empty collection (zero tweets).
    if total_tweet:
        spam_tweet_ratio = float(num_spam_tweet) / float(total_tweet)
    else:
        spam_tweet_ratio = 0.0
    print('Percent of spam tweets is %f' % (spam_tweet_ratio))
def save_user_info(self):
    """Pass the collection to ``DupInfo.save_user_info``.

    The collection is loaded from ``self.sourcefile``; nothing is
    returned.
    """
    dup_info = dupefilter.DupInfo(prefix=self.prefix)
    tweets_source = util.loadjson(self.sourcefile)
    dup_info.save_user_info(collection=tweets_source)
def get_url_per_user(self):
    """Return the result of ``DupInfo.get_url_per_user`` for the collection.

    The collection is loaded from ``self.sourcefile``.
    """
    dup_info = dupefilter.DupInfo(prefix=self.prefix)
    tweets_source = util.loadjson(self.sourcefile)
    return dup_info.get_url_per_user(collection=tweets_source)
def get_num_tweet(self):
    """Return the total number of tweets in one collection.

    The value is the ``num_tweet`` metadata variable of the ``DupInfo``
    object built for ``self.prefix``.
    """
    return dupefilter.DupInfo(prefix=self.prefix).get_metadata(
        variable='num_tweet')