def get_priors(): try: load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env') username = os.getenv('DATABASE_USER') password = os.getenv('DATABASE_PASSWORD') conn_string = "dbname='twitter' user="******" password = "******"got filenames" for file_path, file_name in file_names: try: f = json.load(open(file_path + '/' + file_name, 'r')) except: print file_path print file_name continue if len(f) > 150: user_id = file_name.split('_')[0] cur.execute( 'SELECT user_info_json FROM followers WHERE user_id = %s', (user_id, )) record = cur.fetchone() if record: if record[0]: user_info = ast.literal_eval(record[0]) else: continue gf = GetFeatures(user_id, user_info, f) gf.get_user_features() gf.collect_tweets() gf.get_content_features() gf.get_temporal_features() features['temporal'].append(gf.temporal_features) features['content'].append(gf.content_features) features['network'].append(gf.network_features) features['user'].append(gf.user_features) pickle.dump(features, open('priors_feature_list.p', 'wb'))
def find_bots(self, priors):
    """Featurize this level's candidate followers and add the outliers to the clique.

    Steps, in order:

    1. Pick the candidates. At level 0 every entry of ``self.followers`` is a
       candidate. At later levels only followers occurring more than
       ``floor(0.3 * self.n)`` times in ``self.followers`` are kept; when that
       yields fewer than ``int(.08 * len(self.followers))`` entries, followers
       from the most common ones that occur more than once are added too.
    2. For each candidate not already in ``self.users`` / ``self.ignore_users``
       read the cached ``followers`` row: flagged rows and rows whose
       ``user_info_json`` is missing or unparsable go to ``self.ignore_users``,
       parsable ones fill ``self.user_info``, everything else is queued in
       ``self.users_to_query`` before ``get_user_info(self)`` is called.
    3. Load or fetch each candidate's timeline and, when it has more than 50
       entries and user info is known, extract features with ``GetFeatures``.
    4. Pickle the extracted features to
       ``clique_expansion/<seed_user>_all_features.p``.
    5. Concatenate ``priors`` with the new features, vectorize each feature
       kind with ``self.vec``, normalize, and hand the matrices to
       ``self.perform_outlier_detection`` together with the number of priors.
    6. Increment ``self.level``, record every outlier in ``self.clique``,
       ``self.to_check`` and ``self.clique_features``, and refresh ``self.n``.

    Args:
        priors: mapping with the keys ``'temporal'``, ``'content'``,
            ``'network'`` and ``'user'``, each holding a list of feature
            entries (presumably the lists pickled by ``get_priors`` --
            confirm). The lists are copied, never modified.

    Returns:
        None. Results are stored on ``self``.
    """
    # Order matters only for the final state of self.vec, which stays fitted
    # on the last kind ('user') exactly as before.
    feature_kinds = ('temporal', 'content', 'network', 'user')

    self.users_to_query = set()
    features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    followers_set = set(self.followers)

    if self.level > 0:
        print("Number of followers: " + str(len(self.followers)))
        follower_counts = Counter(self.followers).most_common()
        # should fix this to be a more precise measure
        size_to_keep = int(.08 * len(self.followers))
        connectedness_threshold = floor(0.3 * self.n)
        print(size_to_keep)
        print(connectedness_threshold)
        tmp_followers = [user for user, count in follower_counts
                         if count > connectedness_threshold]
        print("NUmber of followers over threshold = " + str(len(tmp_followers)))
        if len(tmp_followers) < size_to_keep:
            # Top up from the most common followers; duplicates vanish in
            # the set built below.
            tmp_followers.extend(
                [user for user, count in follower_counts[:size_to_keep]
                 if count > 1])
        followers_set = set(tmp_followers)
        print("Number of connected followers: " + str(len(followers_set)))

    print("Getting all user info...")
    for follower in followers_set:
        follower = str(follower)
        if follower in self.users or follower in self.ignore_users:
            continue
        self.cur.execute(
            'SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s',
            (follower,))
        record = self.cur.fetchone()
        if record:
            suspended, deleted, other_error, has_info, info_json = record
            if suspended or deleted or other_error:
                self.ignore_users.add(follower)
                continue
            if has_info:
                if not info_json:
                    # Flag is set but the payload is missing: unusable.
                    self.ignore_users.add(follower)
                    continue
                try:
                    self.user_info[follower] = ast.literal_eval(info_json)
                except Exception as e:
                    # Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt/SystemExit during long runs.
                    print(e)
                    self.ignore_users.add(follower)
                continue
        # No row, or a row without cached user info.
        self.users_to_query.add(follower)
    # Presumably resolves everything queued in self.users_to_query into
    # self.user_info -- defined elsewhere, confirm.
    get_user_info(self)

    print("Getting all timeline info and extracting features")
    for follower in followers_set:
        follower = str(follower)
        if follower in self.users or follower in self.ignore_users:
            continue
        self.users.add(follower)
        self.cur.execute(
            'SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s',
            (follower,))
        record = self.cur.fetchone()
        if record and (record[0] or record[1] or record[2]):
            # User is suspended, deleted or errored.
            self.ignore_users.add(follower)
            continue
        if record and record[3]:
            # Timeline flag is set: read the stored timeline from file.
            timeline = get_timeline_from_file(self, follower)
        else:
            timeline = get_user_timeline(self, follower)

        if not (timeline and self.user_info.get(follower)
                and len(timeline) > 50):
            continue
        gf = GetFeatures(follower, self.user_info[follower], timeline)
        try:
            gf.get_user_features()
            gf.collect_tweets()
            gf.get_content_features()
            gf.get_temporal_features()
            # Read all four results before storing any, so a failure cannot
            # leave the per-kind dicts with different sets of followers.
            extracted = {
                'temporal': gf.temporal_features,
                'content': gf.content_features,
                'network': gf.network_features,
                'user': gf.user_features,
            }
        except Exception as e:
            print("ERROR GETTING FEATURES")
            print(e)
            print(follower)
            print(self.user_info[follower])
        else:
            for kind in feature_kinds:
                features[kind][follower] = extracted[kind]
            self.current_level_users.append(follower)

    # need to incorporate other network features
    # gf.features['num_shared_edges'] = follower_counts[user]
    # we can look at the out-degree of the collapsed ego network. We also
    # calculate the average out degree, which is the average number of
    # followers per follower.
    # need to get the followers for all these
    with open('clique_expansion/' + self.seed_user + '_all_features.p', 'wb') as f:
        pickle.dump(features, f)

    len_priors = len(priors['temporal'])
    # Build fresh lists instead of extending the caller's: the old code
    # aliased ``priors`` and grew it on every call.
    # NOTE(review): rows follow dict iteration order of ``features``; confirm
    # perform_outlier_detection maps rows back to followers the same way.
    current_features = dict(
        (kind, list(priors[kind]) + list(features[kind].values()))
        for kind in feature_kinds)

    print("Performing anomaly detection")
    X = dict()
    for kind in feature_kinds:
        X[kind] = self.vec.fit_transform(current_features[kind]).toarray()
    # Drop the combined lists before normalizing to keep peak memory down.
    del current_features
    for kind in feature_kinds:
        X[kind] = normalize(X[kind])

    outliers = self.perform_outlier_detection(X, len_priors)
    self.level += 1
    self.clique_features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    for follower in outliers:
        self.clique.add((follower, self.level))
        self.to_check.add(follower)
        for kind in ('content', 'network', 'user', 'temporal'):
            self.clique_features[kind][follower] = features[kind][follower]

    print(self.clique)
    self.n = float(len(self.clique))
    print("Current size of cluster: " + str(self.n))
def save_features(self):
    """Extract features for this level's candidate followers and pickle them.

    Candidate selection, the user-info pass and the timeline pass are the
    same as the first half of ``find_bots``:

    * At level 0 every entry of ``self.followers`` is a candidate; at later
      levels only followers occurring more than ``floor(0.3 * self.n)`` times
      are kept, topped up from the most common followers that occur more
      than once when fewer than ``int(.08 * len(self.followers))`` qualify.
    * Candidates already in ``self.users`` or ``self.ignore_users`` are
      skipped. Flagged rows, and rows whose ``user_info_json`` is missing or
      unparsable, are added to ``self.ignore_users``; parsable ones fill
      ``self.user_info``; the rest are queued in ``self.users_to_query``
      before ``get_user_info(self)`` is called.
    * Features are extracted with ``GetFeatures`` for candidates that have
      user info and a timeline of more than 50 entries; those followers are
      appended to ``self.current_level_users``.

    The resulting ``{'temporal', 'content', 'user', 'network'}`` mapping of
    follower id -> features is written to
    ``clique_expansion/all_features/<seed_user>_all_features.p``.

    Returns:
        None.
    """
    self.users_to_query = set()
    features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    followers_set = set(self.followers)

    if self.level > 0:
        print("Number of followers: " + str(len(self.followers)))
        follower_counts = Counter(self.followers).most_common()
        # should fix this to be a more precise measure
        size_to_keep = int(.08 * len(self.followers))
        connectedness_threshold = floor(0.3 * self.n)
        print(size_to_keep)
        print(connectedness_threshold)
        tmp_followers = [user for user, count in follower_counts
                         if count > connectedness_threshold]
        print("NUmber of followers over threshold = " + str(len(tmp_followers)))
        if len(tmp_followers) < size_to_keep:
            # Top up from the most common followers; duplicates vanish in
            # the set built below.
            tmp_followers.extend(
                [user for user, count in follower_counts[:size_to_keep]
                 if count > 1])
        followers_set = set(tmp_followers)
        print("Number of connected followers: " + str(len(followers_set)))

    print("Getting all user info...")
    for follower in followers_set:
        follower = str(follower)
        if follower in self.users or follower in self.ignore_users:
            continue
        self.cur.execute(
            'SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s',
            (follower, ))
        record = self.cur.fetchone()
        if record:
            suspended, deleted, other_error, has_info, info_json = record
            if suspended or deleted or other_error:
                self.ignore_users.add(follower)
                continue
            if has_info:
                if not info_json:
                    # Flag is set but the payload is missing: unusable.
                    self.ignore_users.add(follower)
                    continue
                try:
                    self.user_info[follower] = ast.literal_eval(info_json)
                except Exception as e:
                    # Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt/SystemExit; now matches the handling
                    # in the other save_features definition.
                    print(e)
                    self.ignore_users.add(follower)
                continue
        # No row, or a row without cached user info.
        self.users_to_query.add(follower)
    # Presumably resolves everything queued in self.users_to_query into
    # self.user_info -- defined elsewhere, confirm.
    get_user_info(self)

    print("Getting all timeline info and extracting features")
    for follower in followers_set:
        follower = str(follower)
        if follower in self.users or follower in self.ignore_users:
            continue
        self.users.add(follower)
        self.cur.execute(
            'SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s',
            (follower, ))
        record = self.cur.fetchone()
        if record and (record[0] or record[1] or record[2]):
            # User is suspended, deleted or errored.
            self.ignore_users.add(follower)
            continue
        if record and record[3]:
            # Timeline flag is set: read the stored timeline from file.
            timeline = get_timeline_from_file(self, follower)
        else:
            timeline = get_user_timeline(self, follower)

        if not (timeline and self.user_info.get(follower)
                and len(timeline) > 50):
            continue
        gf = GetFeatures(follower, self.user_info[follower], timeline)
        try:
            gf.get_user_features()
            gf.collect_tweets()
            gf.get_content_features()
            gf.get_temporal_features()
            # Read all four results before storing any, so a failure cannot
            # leave the per-kind dicts with different sets of followers.
            extracted = {
                'temporal': gf.temporal_features,
                'content': gf.content_features,
                'network': gf.network_features,
                'user': gf.user_features,
            }
        except Exception as e:
            print("ERROR GETTING FEATURES")
            print(e)
            print(follower)
            print(self.user_info[follower])
        else:
            for kind, value in extracted.items():
                features[kind][follower] = value
            self.current_level_users.append(follower)

    with open(
            'clique_expansion/all_features/' + self.seed_user +
            '_all_features.p', 'wb') as f:
        pickle.dump(features, f)
def save_features(self):
    """Extract features for every follower of the seed user and pickle them.

    Unlike the level-aware variant, every entry of ``self.followers`` is
    considered and ``self.users`` is not consulted; only
    ``self.ignore_users`` filters candidates.

    Pass 1 reads each candidate's cached ``followers`` row. Rows flagged
    suspended / deleted / other_error, and rows whose ``user_info_json`` is
    missing or fails to parse, put the user in ``self.ignore_users``;
    parsable JSON fills ``self.user_info``; all other candidates are queued
    in ``self.users_to_query`` before ``get_user_info(self)`` runs.

    Pass 2 loads the timeline (from file when the row's ``timeline`` column
    is truthy, otherwise via ``get_user_timeline``) and runs ``GetFeatures``
    for users with known user info and more than 150 timeline entries.

    The collected mapping is written to
    ``clique_expansion/all_features/<seed_user>_all_features.p``.

    Returns:
        None.
    """
    self.users_to_query = set()
    features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
    followers_set = set(self.followers)

    print("Getting all user info...")
    for raw_id in followers_set:
        uid = str(raw_id)
        if uid in self.ignore_users:
            continue
        self.cur.execute(
            'SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s',
            (uid, ))
        row = self.cur.fetchone()
        if row:
            if row[0] or row[1] or row[2]:
                self.ignore_users.add(uid)
                continue
            if row[3]:
                if not row[4]:
                    # Info flag set without a JSON payload.
                    self.ignore_users.add(uid)
                    continue
                try:
                    self.user_info[uid] = ast.literal_eval(row[4])
                except Exception as err:
                    print(err)
                    self.ignore_users.add(uid)
                continue
        # Reached when there is no row or no cached user info.
        self.users_to_query.add(uid)
    get_user_info(self)

    print("Getting all timeline info and extracting features")
    for raw_id in followers_set:
        uid = str(raw_id)
        if uid in self.ignore_users:
            continue
        self.cur.execute(
            'SELECT timeline FROM followers WHERE user_id = %s',
            (uid, ))
        row = self.cur.fetchone()
        if row and row[0]:
            # A timeline is already stored; it has to be read from file.
            tweets = get_timeline_from_file(self, uid)
        else:
            tweets = get_user_timeline(self, uid)

        usable = tweets and self.user_info.get(uid) and len(tweets) > 150
        if not usable:
            continue
        extractor = GetFeatures(uid, self.user_info[uid], tweets)
        try:
            extractor.get_user_features()
            extractor.collect_tweets()
            extractor.get_content_features()
            extractor.get_temporal_features()
            features['temporal'][uid] = extractor.temporal_features
            features['content'][uid] = extractor.content_features
            features['network'][uid] = extractor.network_features
            features['user'][uid] = extractor.user_features
        except Exception as err:
            print("ERROR GETTING FEATURES")
            print(err)
            print(uid)
            print(self.user_info[uid])

    out_path = ('clique_expansion/all_features/' + self.seed_user +
                '_all_features.p')
    with open(out_path, 'wb') as out_file:
        pickle.dump(features, out_file)