def check_debot_api(prefix):
    """Query the DeBot web API for every detected spam user and store the results."""
    full_prefix = util.get_full_prefix(prefix)
    spam_group = load_user_all(full_prefix)
    debot_result = {}
    count = 0
    unique_user = set([])
    for name, group in spam_group.iteritems():
        print name
        print
        user = group['screen_name']
        print user
        count += 1
        if user in unique_user:
            continue
        else:
            unique_user.add(user)
        # `path` is the DeBot endpoint URL, assumed to be defined at module level
        mydata = [('screen_name', '@' + user)]
        mydata = urllib.urlencode(mydata)
        req = urllib2.Request(path, mydata)
        req.add_header("Content-type", "application/x-www-form-urlencoded")
        page = urllib2.urlopen(req).read()
        if 'This account has not been detected by DeBot' in page:
            print 'nobot'
            debot_result[user] = 'nobot'
        else:
            print 'isbot'
            debot_result[user] = 'isbot'
    json.dump(
        debot_result,
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'w'))
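# Hedged sketch (not part of the original pipeline): the DeBot lookup above,
# factored into a single-user helper for testing. `debot_path` stands in for
# the module-level `path` (the DeBot endpoint URL), which is assumed to be
# defined elsewhere in this file.
def debot_lookup_example(screen_name, debot_path):
    """Query DeBot for one screen name and return 'isbot' or 'nobot'."""
    mydata = urllib.urlencode([('screen_name', '@' + screen_name)])
    req = urllib2.Request(debot_path, mydata)
    req.add_header("Content-type", "application/x-www-form-urlencoded")
    page = urllib2.urlopen(req).read()
    if 'This account has not been detected by DeBot' in page:
        return 'nobot'
    return 'isbot'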
def user_info_nine_urls():
    """Collect per-group features (tweet-count variability, top language) for the nine URL shorteners."""
    from collections import Counter
    y = []
    x = []
    for keyword, KEYWORD in [('bit', ['bit ly']), ('tinyurl', ['tinyurl']),
                             ('goo', ['goo gl']), ('dld', ["dld bz"]),
                             ('ift', ["ift tt"]), ('dlvr', ['dlvr it']),
                             ('ow', ['ow ly']), ('lnis', ['ln is']),
                             ('viid', ['viid'])]:
        # keyword = keyword + "_long"
        prefix = util.get_full_prefix(keyword)
        #streamer.collect(keyword=KEYWORD, filename=util.get_full_src_path(prefix), num_tweets=NUM_TWEETS, duration = 43200)
        detector = SpamDetector(prefix=prefix,
                                url_based=True,
                                collect_url_only=False)
        # pprint.pprint(detector.get_user_info())
        group = detector.get_spam_group()
        user_info = detector.get_user_info()
        # pprint.pprint(group)
        # use two features for now: top language and tweet-count variability
        for index, g in enumerate(group.keys()):
            user_infos = [user_info[str(u)] for u in group[g]['spam_user']]
            std = np.std([u["statuses_count"] for u in user_infos])
            mean = np.mean([u["statuses_count"] for u in user_infos])
            top_language = Counter(
                [u["lang"] for u in user_infos]).most_common(1)[0][0]
            x.append([std / mean, top_language])
            #return group[g]['spam_user'], user_info
            y.append(keyword + str(index + 1))
    print x, y
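# Hedged sketch: the two per-group features computed above, extracted into a
# helper so they can be reused or tested in isolation. Assumes each element of
# `user_infos` is a Twitter user object with "statuses_count" and "lang"
# fields, as returned by get_user_info().
def group_features_example(user_infos):
    from collections import Counter
    counts = [u["statuses_count"] for u in user_infos]
    variability = np.std(counts) / np.mean(counts)  # coefficient of variation
    top_language = Counter(
        [u["lang"] for u in user_infos]).most_common(1)[0][0]
    return [variability, top_language]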
def load_sample_user(group='mybot'):
    """Report how many detected accounts tweet whitelisted news URLs."""
    import urlparse
    dic = json.load(open(DEBOT_DIR + 'debot_mybot_total_news_url.json', 'r'))
    prefix = 'bit'
    full_prefix = util.get_full_prefix(prefix)
    my_bot = load_user_screenname(full_prefix)
    print '%s news result' % (group)
    # these four domains are generic hosts rather than news sites, so exclude them
    exclude = ['twitter.com', 'fb.me', 'www.youtube.com', 'youtu.be']
    count = 0
    total = 0
    percents = []
    for u, v in dic[group].iteritems():
        if u not in my_bot and group == 'mybot':
            continue
        total += 1
        have_news = False
        num_news = 0
        for url in v:
            netloc = urlparse.urlparse(url).netloc
            if netloc in crawler.whitelist and netloc not in exclude:
                num_news += 1
                have_news = True
        if have_news:
            count += 1
            percents.append(num_news * 1.0 / len(v))
    print 'total num accounts %d' % (total)
    print 'num news accounts %d' % (count)
    print 'avg percent of news tweets %f' % (sum(percents) / len(percents))
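# Illustration only: the news check above keys on the URL's host. For example,
# urlparse.urlparse('http://www.nytimes.com/2016/05/some-story').netloc
# evaluates to 'www.nytimes.com', which is then matched against
# crawler.whitelist and the generic-host exclude list.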
def parameter_sweeping_plot_miniplot():
    all_range = []
    all_y = []
    all_OX = []
    all_np = []
    titles = []
    for key in sorted(
        ['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']):
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        #result = detector.parameter_sweeping_plot(min_duplicate_factor = 3, return_all = True)
        result = detector.parameter_sweeping_plot(percent_same=0.6,
                                                  return_all=True)
        titles.append(key)
        all_range.append(result[0])
        all_y.append(result[1])
        all_OX.append(result[2])
        all_np.append(result[3])
    timeline_new.plot_xybar_miniplot(
        all_range,
        all_y,
        all_OX,
        all_np,
        titles,
        xlabel='percent same',
        ylabel='number of spam users',
        filename=
        'parameter_sweeping/parameter_sweeping_plot_all_URL_shorteners_min_dup_factor')
    exit()
def helper_print_metadata():
    for key in ['viid', 'goo', 'bit', 'dld', 'ift', 'dlvr']:
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        #detector.print_metadata()
        #pprint.pprint(detector.get_suspicious_user_group(startover = False, filter_function = url_detect))
        detector.save_user_info()
    exit()
def load_user(prefix, group_id):
    """Return (spam user list, user_info) for the group_id-th spam group."""
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    user_info = detector.get_user_info()
    #alluser = set([])
    id_count = 1
    for g in group:
        if id_count == group_id:
            return group[g]['spam_user'], user_info
        id_count += 1
def store_num_tweet_per_user_json():
    #dic = json.load(open('spam_category.json', 'r'))
    dic = {}
    for key in sorted(
        ['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']):
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        dic[key] = detector.get_tweet_per_user()
    json.dump(dic,
              open('metadata/user_num_tweet_all_URL_shorteners.json', 'w'))
    exit()
def compare_score_all_user(prefix,
                           return_debot_only=False,
                           return_mybot_only=False):
    """Compare the DeBot bot list with our detected bot list for a given prefix."""
    debot_result = json.load(
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'r'))
    debot_bot = set([
        user for user, status in debot_result.iteritems() if status == 'isbot'
    ])
    full_prefix = util.get_full_prefix(prefix)
    my_bot = load_user_screenname(full_prefix)
    print 'num bots for debot %d' % (len(debot_bot))
    print 'num bots for our method %d' % (len(my_bot))
    print 'intersection is %d' % (len(debot_bot.intersection(my_bot)))
    print 'percent our bot in the intersection is %f' % (
        1.0 * len(debot_bot.intersection(my_bot)) / len(my_bot))
    print
    print 'num bots in debot, not in ours %d' % (len(
        debot_bot.difference(my_bot)))
    print 'num bots in ours, not in debot %d' % (len(
        my_bot.difference(debot_bot)))
    print 'Closer analysis of bots identified by debot...'
    debot_only = debot_bot.difference(my_bot)
    if return_debot_only:
        return debot_only
    if return_mybot_only:
        return my_bot.difference(debot_bot)
    user_info = json.load(open(full_prefix + 'user_info.json', 'r'))
    user_info_dic = {}
    for u, v in user_info.iteritems():
        user_info_dic[v['screen_name']] = v
    count = 0
    for u in debot_only:
        #pprint.pprint(user_info_dic[u]['screen_name'])
        if user_info_dic[u]['verified']:
            count += 1
            #print user_info_dic[u]['screen_name']
    print 'debot num verified ', count
    #print len(my_bot.intersection(user_info_dic.keys()))
    print
    count = 0
    for u in my_bot:
        if u in user_info_dic:
            if user_info_dic[u]['verified']:
                count += 1
                #print user_info_dic[u]
    print 'mybot num verified ', count
    print
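# Hedged usage sketch: compare_score_all_user('bit') prints the overlap
# statistics; passing return_debot_only=True (or return_mybot_only=True)
# instead returns the screen names flagged by only one of the two methods,
# so they can be inspected separately (as sample_user() below does).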
def load_user(prefix, group_id):
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    user_info = detector.get_user_info()
    #alluser = set([])
    #print '[IN load_user], group_id is %d' %(group_id)
    #print '[IN load_user], length of group is %d' %(len(group))
    id_count = 1
    for g in group:
        #print 'current group_id is %d' %(id_count)
        if id_count == group_id:
            return group[g]['spam_user'], user_info
        id_count += 1
def run_long_experiment():
    for keyword, KEYWORD in [('bitly', ['bit ly']), ('tinyurl', ['tinyurl']),
                             ('goo', ['goo gl']), ('dld', ["dld bz"]),
                             ('ift', ["ift tt"]), ('dlvr', ['dlvr it']),
                             ('ow', ['ow ly']), ('lnis', ['ln is']),
                             ('viid', ['viid'])]:
        keyword = keyword + "_long"
        prefix = util.get_full_prefix(keyword)
        NUM_TWEETS = 500000
        #streamer.collect(keyword=KEYWORD, filename=util.get_full_src_path(prefix), num_tweets=NUM_TWEETS, duration = 43200)
        detector = SpamDetector(prefix=prefix,
                                url_based=True,
                                collect_url_only=False)
        detector.get_percent_of_spam()
def get_connectivity(prefix, group_id):
    """Build the follower graph for one spam group and return its mean degree centrality."""
    userlist, user_info = load_user(prefix, group_id)
    filename = util.get_full_prefix(prefix) + "group_" + str(
        group_id) + "_user_followers_dic.json"
    myFollowerFinder = follower.FollowerFinder(prefix=prefix,
                                               userlist=userlist)
    """generate and save json file"""
    TYPE = "undirected"
    SAVE_DIR = "gephi/"
    g = Graph(myFollowerFinder.load_file(filename=filename), TYPE)
    g.build_graph()
    g.add_screenname(user_info)
    print len(g.get_graph().nodes())
    print approx.node_connectivity(g.get_graph())
    dic = degree_alg.degree_centrality(g.get_graph())
    print sum(dic.values()) / len(dic)
    return sum(dic.values()) / len(dic)
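# Hedged note: networkx's degree_centrality reports degree / (n - 1) for each
# node, so the value returned by get_connectivity is the mean normalized
# degree of the follower graph, i.e. a rough density measure of the bot group.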
def load_user(prefix):
    """Collect the unique spam URLs tweeted by each spam group."""
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    """
    Run these two lines if the URL info file does not exist:
    url_info = detector.get_url_per_user()
    json.dump(url_info, open('metadata/' + prefix + '_user_url_dictionary.json', 'w'))
    """
    url_info = json.load(
        open('metadata/' + prefix + '_user_url_dictionary.json', 'r'))
    print len(url_info)
    #alluser = set([])
    # `cache` is assumed to be a module-level dict mapping twitter.com status
    # URLs to their previously resolved external URLs
    for index, g in enumerate(group):
        unique_url = set([])
        for user in group[g]['spam_user']:
            for url in url_info[str(user)]:
                if prefix in url:
                    unique_url.add(url)
                elif 'twitter.com' in url:
                    if url in cache:
                        print 'found url in cache'
                        unique_url.add(cache[url])
                    else:
                        try:
                            print url
                            new_url = extract_url_from_twitter_page(
                                url, prefix)
                            if new_url:
                                unique_url.add(new_url)
                        except Exception, e:
                            print e
                            time.sleep(2)
        #unique_url = unique_url.union(set(url_info[str(user)]))
        #print len(unique_url)
        #pprint.pprint(unique_url)
        group[g]['unique_url'] = list(unique_url)
def get_and_store_status(filename):
    if os.path.isfile(filename):
        print 'file exists'
        return
    dic = {}
    #['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']
    for prefix in [
            'bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow'
    ]:
        print prefix
        full_prefix = util.get_full_prefix(prefix)
        detector = detect.SpamDetector(prefix=full_prefix)
        user = detector.get_spam_user_info(variable='screen_name')
        #user = ['WuerzRodrigo', 'reed_schepens']
        #user = ['InceZehraince3', 'noexistingasdf123', 'zhouhanchen', 'NBA76ersFans']
        #user = list(user)[:2]
        print len(user)
        dic[prefix] = check_status(user)
    json.dump(dic, open(filename, 'w'))
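# Hypothetical usage (the output filename below is illustrative, not from the
# original code):
#   get_and_store_status('metadata/user_status_all_URL_shorteners.json')
# The call returns early if the file already exists, so it is safe to re-run.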
def get_social_network(prefix, group_id):
    userlist, user_info = load_user(prefix, group_id)
    filename = util.get_full_prefix(prefix) + "group_" + str(
        group_id) + "_user_followers_dic.json"
    myFollowerFinder = FollowerFinder(prefix=prefix, userlist=userlist)
    # if the file does not exist, call the Twitter API to collect the data
    if not os.path.isfile(filename):
        myFollowerFinder.getFollowers(userlist, filename=filename)
    #myFollowerFinder.check_common_user(filename = filename)
    """generate and save json file"""
    TYPE = "undirected"
    SAVE_DIR = "gephi/"
    # debug leftovers: uncomment to dump the raw follower file and stop before
    # building the graph
    #print myFollowerFinder.load_file(filename=filename)
    #exit()
    g = social_network.Graph(myFollowerFinder.load_file(filename=filename),
                             TYPE)
    g.build_graph()
    g.add_screenname(user_info)
    #g.delete_singleton()
    g.generatejsonfile(SAVE_DIR + prefix + "_bot_group_" + str(group_id))
def sample_user():
    # sample of debot only: [u'melanieviveros9', u'RatanSharda55', u'KevinMcshea', u'imchrismva', u'Phaedrus08']
    import random
    import urlparse
    #debot_only = compare_score_all_user('bit', return_mybot_only = True)
    #samples = random.sample(list(debot_only), 5)
    #print samples
    prefix = 'bit'
    full_prefix = util.get_full_prefix(prefix)
    myUserCrawler = crawler.UserCrawler(simplecrawl=True)
    debot_result = json.load(
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'r'))
    debot_bot = set([
        user for user, status in debot_result.iteritems() if status == 'isbot'
    ])
    my_bot = load_user_screenname_custom(full_prefix)
    data = {'debot': list(debot_bot), 'mybot': list(my_bot)}
    final_result = {'debot': {}, 'mybot': {}}
    for name, userlist in data.iteritems():
        print name
        for u in userlist:
            final_result[name][u] = []
            try:
                result = myUserCrawler.get200(u,
                                              use_screen_name=True,
                                              return_error_code=False)
                crawler.tokenindex += 1
                crawler.tokenindex = crawler.tokenindex % crawler.ROUND
                time.sleep(0.2)
                for t in result:
                    final_result[name][u] += streamer.get_embedded_url(t)
            except Exception, e:
                print e
                myUserCrawler = crawler.UserCrawler(simplecrawl=True)
                time.sleep(20)
""" Be careful when calling crawler on existing duplifliers because of update of filename from actual text to numerical number group_1, group_2, ..., ... """ #helper_print_metadata() #parameter_sweeping_plot_miniplot() #get_spam_group_num_tweet() """ input variables: keyword, DATA_DIR (defined at very top) """ # default keyword keyword = 'git_test' prefix = util.get_full_prefix(keyword) if len(sys.argv) > 1: keyword = sys.argv[1] #SOURCE_FILE = DATA_DIR + PREFIX[:PREFIX.index('/')] + '.txt' #SOURCE_FILE = util.slice_data(2014, 10, 10) """ variables for streaming: (optional) KEYWORD: a list of keywords num_tweets: number of tweets to collect """ KEYWORD = ['bit'] NUM_TWEETS = 1000 """ start the streamer first (optional if the dataset already exists) """