def get_priors():
    # Builds per-user feature lists from locally stored timeline JSON files
    # and pickles the result to 'priors_feature_list.p'.
    try:
        # Database credentials are read from a .env file at a hard-coded
        # absolute path.
        load_dotenv('/home/amanda/bigDisk/Twitter/creds/.env')
        username = os.getenv('DATABASE_USER')
        password = os.getenv('DATABASE_PASSWORD')
        # NOTE(review): the next line is corrupted in this copy -- the
        # credentials were masked and the statements that followed were fused
        # into it. The remainder of this `try` block, its `except` clause, and
        # the setup of `cur`, `features` and `file_names` are not visible
        # here; restore them from the original file before running.
        conn_string = "dbname='twitter' user="******" password = "******"got filenames"
    for file_path, file_name in file_names:
        try:
            # NOTE(review): the handle returned by open() is never explicitly
            # closed.
            f = json.load(open(file_path + '/' + file_name, 'r'))
        except:
            # Any failure to open/parse the file is reported by path and name
            # and the file is skipped. NOTE(review): the bare `except` also
            # catches KeyboardInterrupt/SystemExit.
            print file_path
            print file_name
            continue
        # Only files whose loaded JSON has more than 150 entries are used
        # (presumably one entry per tweet -- TODO confirm).
        if len(f) > 150:
            # The user id is the part of the file name before the first '_'.
            user_id = file_name.split('_')[0]
            cur.execute(
                'SELECT user_info_json FROM followers WHERE user_id = %s',
                (user_id, ))
            record = cur.fetchone()
            if record:
                if record[0]:
                    # The stored value is parsed as a Python literal, not
                    # with the json module.
                    user_info = ast.literal_eval(record[0])
                else:
                    # Row exists but has no stored user info: skip this user.
                    continue
                gf = GetFeatures(user_id, user_info, f)
                gf.get_user_features()
                gf.collect_tweets()
                gf.get_content_features()
                gf.get_temporal_features()
                # One entry per user is appended to each feature list, in the
                # same order across all four lists.
                features['temporal'].append(gf.temporal_features)
                features['content'].append(gf.content_features)
                features['network'].append(gf.network_features)
                features['user'].append(gf.user_features)
    # NOTE(review): the output file handle is never explicitly closed.
    pickle.dump(features, open('priors_feature_list.p', 'wb'))
예제 #2
0
    def find_bots(self, priors):
        """Run one level of clique expansion and record the outlying followers.

        Steps, in order:
          1. Pick the candidate followers: every entry of ``self.followers``
             at level 0; at later levels only followers that occur often
             enough in ``self.followers``.
          2. Load stored profile info for each candidate from the
             ``followers`` table, or queue the candidate in
             ``self.users_to_query`` and call ``get_user_info(self)``.
          3. Obtain each candidate's timeline and, when it has more than 50
             entries and profile info is available, extract its features.
          4. Pickle the new features, combine them with ``priors`` and run
             ``self.perform_outlier_detection``.
          5. Add every reported outlier to ``self.clique`` and
             ``self.to_check`` and keep its features in
             ``self.clique_features``.

        Args:
            priors: dict with the keys 'temporal', 'content', 'network' and
                'user', each holding a list of baseline feature entries.
                The lists are copied before the new features are appended,
                so the caller's object is left unmodified.

        Also updates ``self.users``, ``self.ignore_users``,
        ``self.user_info``, ``self.current_level_users``, ``self.level`` and
        ``self.n``, and writes
        ``clique_expansion/<seed_user>_all_features.p``.
        """
        # Fixed processing order; self.vec is re-fitted per kind, so after
        # this method it is left fitted on the 'user' features.
        kinds = ('temporal', 'content', 'network', 'user')

        self.users_to_query = set()
        features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
        followers_set = set(self.followers)
        if self.level > 0:
            print("Number of followers: " + str(len(self.followers)))
            follower_counts = Counter(self.followers).most_common()
            # should fix this to be a more precise measure
            size_to_keep = int(.08 * len(self.followers))
            connectedness_threshold = floor(0.3 * self.n)
            print(size_to_keep)
            print(connectedness_threshold)
            tmp_followers = [uid for uid, count in follower_counts
                             if count > connectedness_threshold]
            print("NUmber of followers over threshold = " + str(len(tmp_followers)))
            if len(tmp_followers) < size_to_keep:
                # Too few strongly connected followers: top up from the most
                # frequent ones, as long as they occur more than once.
                tmp_followers.extend(
                    uid for uid, count in follower_counts[:size_to_keep]
                    if count > 1)
            followers_set = set(tmp_followers)
            print("Number of connected followers: " + str(len(followers_set)))

        print("Getting all user info...")
        for follower in followers_set:
            follower = str(follower)
            if follower in self.users or follower in self.ignore_users:
                continue
            self.cur.execute('SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record:
                suspended, deleted, other_error, user_info, user_info_json = record
                if suspended or deleted or other_error:
                    self.ignore_users.add(follower)
                    continue
                if user_info:
                    if not user_info_json:
                        # Flagged as having user info, but nothing usable is
                        # stored: ignore the account.
                        self.ignore_users.add(follower)
                        continue
                    try:
                        parsed = ast.literal_eval(user_info_json)
                    except Exception as e:
                        # Was a bare `except:` (which also swallowed
                        # KeyboardInterrupt); the parse error is now reported.
                        print(e)
                        self.ignore_users.add(follower)
                    else:
                        self.user_info[follower] = parsed
                    continue
            # No row, or a row without user info: fetch it.
            self.users_to_query.add(follower)
        get_user_info(self)

        print("Getting all timeline info and extracting features")
        for follower in followers_set:
            follower = str(follower)
            if follower in self.users or follower in self.ignore_users:
                continue
            self.users.add(follower)
            self.cur.execute('SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s', (follower,))
            record = self.cur.fetchone()
            if record and (record[0] or record[1] or record[2]):
                # User is suspended, deleted or otherwise errored.
                self.ignore_users.add(follower)
                continue
            if record and record[3]:
                # Timeline already stored: read it back from file.
                timeline = get_timeline_from_file(self, follower)
            else:
                timeline = get_user_timeline(self, follower)
            if not (timeline and self.user_info.get(follower) and len(timeline) > 50):
                continue
            gf = GetFeatures(follower, self.user_info[follower], timeline)
            try:
                gf.get_user_features()
                gf.collect_tweets()
                gf.get_content_features()
                gf.get_temporal_features()
                extracted = {
                    'temporal': gf.temporal_features,
                    'content': gf.content_features,
                    'network': gf.network_features,
                    'user': gf.user_features,
                }
            except Exception as e:
                print("ERROR GETTING FEATURES")
                print(e)
                print(follower)
                print(self.user_info[follower])
                continue
            # Insert all four kinds together so a failure part-way through
            # can never leave the per-kind dicts with different key sets
            # (which would misalign the matrices built below).
            for kind in kinds:
                features[kind][follower] = extracted[kind]
            self.current_level_users.append(follower)
            # TODO: incorporate other network features, e.g. the number of
            # shared edges per follower.

        # we can look at the out-degree of the collapsed ego network. We also calculate the average out degree,
        # which is the average number of followers per follower.
        # need to get the followers for all these
        with open('clique_expansion/' + self.seed_user + '_all_features.p', 'wb') as f:
            pickle.dump(features, f)

        len_priors = len(priors['temporal'])
        # Build fresh lists instead of aliasing `priors`: the original
        # extended the caller's lists in place, so followers from this level
        # would be counted as priors on any later call with the same object.
        # NOTE(review): new rows follow the dict iteration order of
        # features[kind]; how perform_outlier_detection maps rows back to
        # follower ids is not visible here -- confirm the order it expects.
        current_features = {}
        for kind in kinds:
            current_features[kind] = list(priors[kind])
            current_features[kind].extend(features[kind].values())

        print("Performing anomaly detection")
        X = dict()
        for kind in kinds:
            X[kind] = self.vec.fit_transform(current_features[kind]).toarray()
        # Drop the combined lists before normalising to keep peak memory down.
        current_features = dict()
        for kind in kinds:
            X[kind] = normalize(X[kind])

        outliers = self.perform_outlier_detection(X, len_priors)

        self.level += 1
        self.clique_features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
        for follower in outliers:
            self.clique.add((follower, self.level))
            self.to_check.add(follower)
            for kind in kinds:
                self.clique_features[kind][follower] = features[kind][follower]
        print(self.clique)
        self.n = float(len(self.clique))
        print("Current size of cluster: " + str(self.n))
 def save_features(self):
     """Extract features for the current candidate followers and pickle them.

     Candidates are every entry of ``self.followers`` at level 0; at later
     levels only followers that occur often enough in ``self.followers``.
     For each candidate not already in ``self.users`` or
     ``self.ignore_users``, stored profile info is loaded from the
     ``followers`` table (or the candidate is queued in
     ``self.users_to_query`` for ``get_user_info(self)``), the timeline is
     obtained, and features are extracted when the timeline has more than
     50 entries and profile info is available.

     The resulting dict (keys 'temporal', 'content', 'user', 'network',
     each mapping follower id to its features) is written to
     ``clique_expansion/all_features/<seed_user>_all_features.p``.
     Also updates ``self.users``, ``self.ignore_users``, ``self.user_info``
     and ``self.current_level_users``.
     """
     self.users_to_query = set()
     features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
     followers_set = set(self.followers)
     if self.level > 0:
         print("Number of followers: " + str(len(self.followers)))
         follower_counts = Counter(self.followers).most_common()
         # should fix this to be a more precise measure
         size_to_keep = int(.08 * len(self.followers))
         connectedness_threshold = floor(0.3 * self.n)
         print(size_to_keep)
         print(connectedness_threshold)
         tmp_followers = [
             uid for uid, count in follower_counts
             if count > connectedness_threshold
         ]
         print("NUmber of followers over threshold = " + str(
             len(tmp_followers)))
         if len(tmp_followers) < size_to_keep:
             # Too few strongly connected followers: top up from the most
             # frequent ones, as long as they occur more than once.
             tmp_followers.extend(
                 uid for uid, count in follower_counts[:size_to_keep]
                 if count > 1)
         followers_set = set(tmp_followers)
         print("Number of connected followers: " + str(len(followers_set)))

     print("Getting all user info...")
     for follower in followers_set:
         follower = str(follower)
         if follower in self.users or follower in self.ignore_users:
             continue
         self.cur.execute(
             'SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s',
             (follower, ))
         record = self.cur.fetchone()
         if record:
             suspended, deleted, other_error, user_info, user_info_json = record
             if suspended or deleted or other_error:
                 self.ignore_users.add(follower)
                 continue
             if user_info:
                 if not user_info_json:
                     # Flagged as having user info, but nothing usable is
                     # stored: ignore the account.
                     self.ignore_users.add(follower)
                     continue
                 try:
                     parsed = ast.literal_eval(user_info_json)
                 except Exception as e:
                     # Was a bare `except:` (which also swallowed
                     # KeyboardInterrupt); the parse error is now reported.
                     print(e)
                     self.ignore_users.add(follower)
                 else:
                     self.user_info[follower] = parsed
                 continue
         # No row, or a row without user info: fetch it.
         self.users_to_query.add(follower)
     get_user_info(self)

     print("Getting all timeline info and extracting features")
     for follower in followers_set:
         follower = str(follower)
         if follower in self.users or follower in self.ignore_users:
             continue
         self.users.add(follower)
         self.cur.execute(
             'SELECT suspended, deleted, other_error, timeline FROM followers WHERE user_id = %s',
             (follower, ))
         record = self.cur.fetchone()
         if record and (record[0] or record[1] or record[2]):
             # User is suspended, deleted or otherwise errored.
             self.ignore_users.add(follower)
             continue
         if record and record[3]:
             # Timeline already stored: read it back from file.
             timeline = get_timeline_from_file(self, follower)
         else:
             timeline = get_user_timeline(self, follower)
         if not (timeline and self.user_info.get(follower)
                 and len(timeline) > 50):
             continue
         gf = GetFeatures(follower, self.user_info[follower], timeline)
         try:
             gf.get_user_features()
             gf.collect_tweets()
             gf.get_content_features()
             gf.get_temporal_features()
             features['temporal'][follower] = gf.temporal_features
             features['content'][follower] = gf.content_features
             features['network'][follower] = gf.network_features
             features['user'][follower] = gf.user_features
             self.current_level_users.append(follower)
         except Exception as e:
             print("ERROR GETTING FEATURES")
             print(e)
             print(follower)
             print(self.user_info[follower])

     with open(
             'clique_expansion/all_features/' + self.seed_user +
             '_all_features.p', 'wb') as f:
         pickle.dump(features, f)
예제 #4
0
 def save_features(self):
     """Extract features for every follower and pickle them to disk.

     Each entry of ``self.followers`` that is not in ``self.ignore_users``
     gets its stored profile info loaded from the ``followers`` table, or
     is queued in ``self.users_to_query`` for ``get_user_info(self)``.
     Features are then extracted for followers whose timeline has more
     than 150 entries and whose profile info is available, and the
     resulting dict is written to
     ``clique_expansion/all_features/<seed_user>_all_features.p``.
     """
     self.users_to_query = set()
     features = {'temporal': {}, 'content': {}, 'user': {}, 'network': {}}
     followers_set = set(self.followers)

     print("Getting all user info...")
     for candidate in followers_set:
         uid = str(candidate)
         if uid in self.ignore_users:
             continue
         self.cur.execute(
             'SELECT suspended, deleted, other_error, user_info, user_info_json FROM followers WHERE user_id = %s',
             (uid, ))
         row = self.cur.fetchone()
         if row:
             suspended, deleted, other_error, user_info, user_info_json = row
             if suspended or deleted or other_error:
                 self.ignore_users.add(uid)
                 continue
             if user_info:
                 if user_info_json:
                     try:
                         self.user_info[uid] = ast.literal_eval(
                             user_info_json)
                     except Exception as e:
                         print(e)
                         self.ignore_users.add(uid)
                 else:
                     # Flagged as having user info but nothing stored.
                     self.ignore_users.add(uid)
                 continue
         # No row, or a row without user info: queue for fetching.
         self.users_to_query.add(uid)
     get_user_info(self)

     print("Getting all timeline info and extracting features")
     for candidate in followers_set:
         uid = str(candidate)
         if uid in self.ignore_users:
             continue
         self.cur.execute(
             'SELECT timeline FROM followers WHERE user_id = %s',
             (uid, ))
         row = self.cur.fetchone()
         if row and row[0]:
             # Timeline already stored: read it back from file.
             timeline = get_timeline_from_file(self, uid)
         else:
             timeline = get_user_timeline(self, uid)
         usable = timeline and self.user_info.get(uid) and len(
             timeline) > 150
         if not usable:
             continue
         extractor = GetFeatures(uid, self.user_info[uid], timeline)
         try:
             extractor.get_user_features()
             extractor.collect_tweets()
             extractor.get_content_features()
             extractor.get_temporal_features()
             features['temporal'][uid] = extractor.temporal_features
             features['content'][uid] = extractor.content_features
             features['network'][uid] = extractor.network_features
             features['user'][uid] = extractor.user_features
         except Exception as e:
             print("ERROR GETTING FEATURES")
             print(e)
             print(uid)
             print(self.user_info[uid])

     out_path = ('clique_expansion/all_features/' + self.seed_user +
                 '_all_features.p')
     with open(out_path, 'wb') as out_file:
         pickle.dump(features, out_file)