def index():
    """Create users from the JSON request body and push them to the community.

    The parsed request body is converted to a ``dict`` and handed to
    ``user_set.make_new_users``; ``push_to_community`` is then called on the
    same user set.

    Returns:
        A fixed emoji string.
    """
    payload = dict(request.get_json())

    new_users = user_objects.user_set()
    new_users.make_new_users(payload)
    new_users.push_to_community()

    return ' 🐱💻'
def index():
    """Cluster all community users with Infomap and persist the result.

    A directed graph is built with one vertex per user (weighted by
    ``user.activity``) and one edge per non-zero ``bond`` in ``user.edges``.
    Every vertex of the i-th Infomap community gets ``cluster_number = i``
    written to its ``community`` document, and a ``cluster_community`` report
    with the user and cluster counts is created or refreshed.

    Returns:
        The string ``'done'``.
    """
    # The body is not used, but parsing it keeps the original requirement
    # that the request carries a JSON object.
    dict(request.get_json())

    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()

    all_users = user_objects.user_set()
    all_users.get_users({})

    # making empty graph, then vertices, then edges
    g = ig.Graph(directed=True)
    for user in all_users.users:
        g.add_vertex(name=user.user['screen_name'], size=user.activity)
    for user in all_users.users:
        source = user.user['screen_name']
        for edge in user.edges:
            if edge['bond'] != 0:
                g.add_edge(source, edge['screen_name'], weight=edge['bond'])

    # clustering people
    cluster_number = 0
    if g.vcount():
        # The 'weight' attribute only exists once an edge has been added, so
        # g.es['weight'] raises KeyError on an edgeless graph.
        edge_weights = g.es['weight'] if g.ecount() else None
        cluster = g.community_infomap(edge_weights=edge_weights,
                                      vertex_weights=g.vs['size'],
                                      trials=1000)
        for graph in cluster.subgraphs():
            for vertex in graph.vs:
                mongo_app.db.community.update_one(
                    {'user.screen_name': vertex['name']},
                    {'$set': {'cluster_number': cluster_number}})
            cluster_number += 1

    # One upsert replaces the find().count() / update / insert sequence,
    # which was racy and relied on legacy PyMongo calls.
    mongo_app.db.reports.update_one(
        {'report': 'cluster_community'},
        {'$set': {
            'no_users': len(all_users.users),
            'no_clusters': cluster_number
        }},
        upsert=True)
    return 'done'
def show_clusters(self):
    """Render a cluster picker and, on request, a table of that cluster's users.

    The slider value is stored on ``self.cluster_number``; its range is
    ``0 .. self.no_clusters - 1``. When the 'Show Cluster' button is pressed,
    the users whose ``cluster_number`` matches are loaded and shown as a
    dataframe with their name, screen name, activity and the three
    ``bond_stats`` tiers (blank name / 0 interactions when a user has no
    ``bond_stats``).
    """
    label = 'Which Cluster would you like to see'
    self.cluster_number = st.slider(label,
                                    min_value=0,
                                    max_value=self.no_clusters - 1,
                                    value=0,
                                    step=1)
    if not st.button('Show Cluster'):
        return

    users_cluster = user_objects.user_set()
    users_cluster.get_users({'cluster_number': self.cluster_number})

    rows = []
    for user in users_cluster.users:
        row = {
            'User': user.user['name'],
            'Screen name': user.user['screen_name'],
            'Activity': user.activity,
        }
        bond_stats = user.bond_stats
        for tier in (1, 2, 3):
            # Identity test: ``== None`` can be hijacked by a custom __eq__.
            if bond_stats is None:
                screen_name, interactions = '', 0
            else:
                stats = bond_stats[f'tier{tier}']
                screen_name = stats['screen_name']
                interactions = stats['interactions']
            row[f'Tier {tier}'] = screen_name
            row[f'Tier {tier} interactions'] = interactions
        rows.append(row)

    st.dataframe(pd.DataFrame(rows))
def index():
    """Queue one ``tasks.process`` job per community user and wait for them.

    The five bond weights are read from the first ``preprocess`` report and
    sent, together with the user's screen name, to ``tasks.process.delay``.
    A random pause of up to two seconds is kept between submissions.

    The original wait loop used ``while success_count == results_count``,
    which returned immediately whenever there were users and spun forever
    when there were none. This version returns at once when there is nothing
    to do and otherwise polls until every task reports a terminal state, so
    a failed task can no longer block the request indefinitely.

    Returns:
        The string ``'done'``.

    Raises:
        IndexError: users exist but no ``preprocess`` report is stored
            (same exception type as the original ``reports[0]`` lookup).
    """
    # The body is not used, but parsing it keeps the original requirement
    # that the request carries a JSON object.
    dict(request.get_json())

    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()

    all_users = user_objects.user_set()
    all_users.get_users({})
    if not all_users.users:
        return 'done'

    report = mongo_app.db.reports.find_one({'report': 'preprocess'})
    if report is None:
        raise IndexError("no 'preprocess' report found")

    # Loop-invariant: the same weights are sent with every task.
    weights = {
        'tweet_weight': report['tweet_weight'],
        'tweet_mention_weight': report['tweet_mention_weight'],
        'retweet_weight': report['retweet_weight'],
        'quote_weight': report['quote_weight'],
        'reply_weight': report['reply_weight']
    }

    results = []
    for user in all_users.users:
        task_query = {'user': user.user['screen_name'], 'weights': weights}
        result = tasks.process.delay(task_query)
        t = random.uniform(0, 2)
        print(t)  #only for debug purpose
        time.sleep(t)
        results.append(result)

    # NOTE(review): this relies on a result backend being configured;
    # without one the status presumably stays 'PENDING' — confirm.
    terminal_states = ('SUCCESS', 'FAILURE', 'REVOKED')
    while not all(result.status in terminal_states for result in results):
        time.sleep(1)
    return 'done'
def discover(self, discover_query):
    """Tier, thread and flag the last week's tweets of one cluster.

    Steps, for the cluster given by ``discover_query['cluster_number']``:

    1. Load the cluster's users (tier 1) and all other users (tier 2) and
       fetch each tier-1 user's tweets from the last seven days.
    2. Give every tweet a ``tier`` (1, 2 or 3) according to whether most of
       its mentions are cluster members, other known users or unknown
       accounts. The author counts as one tier-1 hit and ties go to the
       lower tier number.
    3. Store per-user ``bond_stats`` in ``community``: for each tier, the
       screen name the user mentioned most and how often.
    4. Build one vector per tweet over all involved screen names plus two
       time components and group tweets into threads with DBSCAN.
    5. Per tier, mark threads whose start timestamp has an unusually high
       number of threads (more than mean + 2 * std) with ``discover=True``
       and write tier/thread/discover/cluster back to ``tweets``.

    Args:
        discover_query: mapping with a ``cluster_number`` entry.

    Returns:
        ``'No tweets - <cluster_number>'`` when no tweets were found,
        otherwise ``None``.
    """
    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()
    cluster_number = discover_query['cluster_number']

    users_t1 = user_objects.user_set()
    users_t1.get_users({'cluster_number': cluster_number})
    users_t2 = user_objects.user_set()
    users_t2.get_users({'cluster_number': {'$ne': cluster_number}})
    # Sets give O(1) membership tests in the per-mention loop below.
    names_t1 = {member.user['screen_name'] for member in users_t1.users}
    names_t2 = {member.user['screen_name'] for member in users_t2.users}

    #setting Dates
    today = datetime.date.today()
    seven_days_back = today - datetime.timedelta(days=7)
    first_day = int(seven_days_back.strftime('%j'))
    last_day = int(today.strftime('%j'))

    def window_queries(user_id):
        """Return the tweet queries covering the last seven days."""
        author = {'user.id': user_id}
        after_start = {
            '$and': [{'created_day': {'$gt': first_day}},
                     {'created_year': seven_days_back.year}]
        }
        until_today = {
            '$and': [{'created_day': {'$lte': last_day}},
                     {'created_year': today.year}]
        }
        if seven_days_back.year == today.year:
            return [{'$and': [author, after_start, until_today]}]
        # The window straddles New Year: query each year separately.
        return [{'$and': [author, after_start]},
                {'$and': [author, until_today]}]

    def top_contact(tally):
        """Return the most-mentioned screen name of *tally* and its count."""
        if not tally:
            return {'screen_name': '', 'interactions': 0}
        # key=tally.get ranks by count; a bare max(tally) would pick the
        # alphabetically last screen name instead.
        name = max(tally, key=tally.get)
        return {'screen_name': name, 'interactions': tally[name]}

    tweets_t1 = tweet_objects.tweet_set()
    involved_names = set()
    start = 0
    for user in users_t1.users:
        screen_name = user.user['screen_name']
        for tweets_query in window_queries(user.user['id']):
            tweets_t1.get_tweets(tweets_query)
        end = len(tweets_t1.tweets)
        tweets_t1.preprocess_tweets_range(start, end)

        # Mention tallies of this user for tier 1, 2 and 3.
        tallies = ({}, {}, {})
        involved_names.add(screen_name)
        for tweet in tweets_t1.tweets[start:end]:
            # The author counts as one tier-1 hit.
            hits = [1, 0, 0]
            for member in tweet.tweet_mentions:
                involved_names.add(member)
                if member in names_t1:
                    slot = 0
                elif member in names_t2:
                    slot = 1
                else:
                    slot = 2
                hits[slot] += 1
                tallies[slot][member] = tallies[slot].get(member, 0) + 1
            # index() returns the first maximum, so ties go to the lower tier.
            tweet.tweet['tier'] = hits.index(max(hits)) + 1

        mongo_app.db.community.update_one(
            {'user.screen_name': screen_name},
            {'$set': {
                'bond_stats': {
                    'tier1': top_contact(tallies[0]),
                    'tier2': top_contact(tallies[1]),
                    'tier3': top_contact(tallies[2])
                }
            }})
        start = end

    vector_template = list(involved_names)
    tweet_vectors = []
    for tweet in tweets_t1.tweets:
        day = float(tweet.tweet['created_day'] / 4)
        hour = float(0.25 * tweet.tweet['tweeted_hour'] / 24)
        stamp = 1 + day + hour
        participants = set(tweet.tweet_mentions)
        participants.add(tweet.tweet_user_screen_name)
        vector = [
            stamp if member in participants else 0
            for member in vector_template
        ]
        vector.append(day)
        vector.append(hour)
        tweet_vectors.append(vector)

    if not tweet_vectors:
        return f'No tweets - {cluster_number}'

    #clustering
    df = pd.DataFrame(tweet_vectors)
    clustring = DBSCAN(eps=0.25, min_samples=1).fit(df)
    for tweet, label in zip(tweets_t1.tweets, clustring.labels_):
        tweet.tweet['thread_number'] = label

    tweet_df = pd.DataFrame([tweet.tweet for tweet in tweets_t1.tweets])
    columns_to_remove = [
        '_id', 'id_str', 'truncated', 'entities', 'source',
        'in_reply_to_status_id', 'in_reply_to_status_id_str',
        'in_reply_to_user_id', 'in_reply_to_user_id_str',
        'in_reply_to_screen_name', 'geo', 'coordinates', 'place',
        'contributors', 'is_quote_status', 'favorited', 'retweeted', 'lang',
        'downloaded_day_year', 'downloaded_year', 'extended_entities',
        'possibly_sensitive', 'quoted_status_id', 'quoted_status_id_str',
        'quoted_status', 'retweeted_status'
    ]
    # errors='ignore' skips absent columns without hiding other failures.
    tweet_df.drop(columns=columns_to_remove, errors='ignore', inplace=True)

    tweet_df['screen_name'] = tweet_df['user'].apply(
        lambda x: x['screen_name'])
    tweet_df['friends_count'] = tweet_df['user'].apply(
        lambda x: x['friends_count'])
    tweet_df['followers_count'] = tweet_df['user'].apply(
        lambda x: x['followers_count'])
    tweet_df.drop('user', axis=1, inplace=True)
    tweet_df['tweeted_at'] = tweet_df.apply(
        lambda row: datetime.datetime(row.created_year, 1, 1, row.tweeted_hour)
        + datetime.timedelta(row.created_day - 1),
        axis=1)
    tweet_df['tweet_url'] = tweet_df.apply(
        lambda row: f'https://twitter.com/{row.screen_name}/status/{row.id}',
        axis=1)

    for tier in (1, 2, 3):
        temp_df = tweet_df[tweet_df['tier'] == tier]
        if len(temp_df) == 0:
            print(
                f'{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number},{tier}-{cluster_number}'
            )
            continue
        print(
            f'{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*,{tier}-{cluster_number}*'
        )

        # One entry per thread: (thread number, its tweets, first timestamp).
        thread_numbers = list(
            temp_df.groupby('thread_number').count().reset_index()
            ['thread_number'])
        threads = []
        for number in thread_numbers:
            thread_tweets = temp_df[temp_df['thread_number'] == number]
            threads.append(
                (number, thread_tweets, min(thread_tweets['tweeted_at'])))

        thread_df = pd.DataFrame({
            'thread_number': thread_numbers,
            'time_stamps': [first_seen for _, _, first_seen in threads],
            'count': [len(thread_tweets) for _, thread_tweets, _ in threads]
        })
        per_stamp = thread_df.groupby('time_stamps').count().reset_index()
        plot_df = pd.DataFrame(per_stamp['time_stamps'])
        plot_df['thread_count'] = per_stamp['count']
        threshold = (plot_df['thread_count'].mean() +
                     2 * plot_df['thread_count'].std())
        stamps = list(
            plot_df[plot_df['thread_count'] > threshold]['time_stamps'])

        for number, thread_tweets, first_seen in threads:
            is_burst = first_seen in stamps
            for tweet_id, tweet_tier in zip(thread_tweets['id'],
                                            thread_tweets['tier']):
                mongo_app.db.tweets.update_one(
                    {'id': int(tweet_id)},
                    {'$set': {
                        'tier': int(tweet_tier),
                        'thread_number': int(number),
                        'discover': is_burst,
                        'cluster_number': cluster_number
                    }})
def index():
    """Rebuild the ``users`` collection with newly discovered accounts.

    Clears ``users``, loads the tweets of the last seven days, counts how
    often every screen name is mentioned, removes muted accounts and
    accounts already in the community, and stores up to 30 candidates with
    their mention count as ``activity``.

    Changes from the original behaviour:
      * the 30 candidates are the 30 most-mentioned accounts; previously
        the tally was cut after the first 30 names in encounter order,
        regardless of their activity;
      * the list returned by ``get_mute_users()`` is no longer extended in
        place;
      * the redundant second Mongo connection is gone.

    Returns:
        The string ``'done'``.
    """
    # The body is not used, but parsing it keeps the original requirement
    # that the request carries a JSON object.
    dict(request.get_json())

    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()
    mongo_app.db.users.delete_many({})

    #todays day year
    today = datetime.date.today()
    day = int(today.strftime('%j'))
    if day - 7 >= 0:
        windows = [(day - 7, today.year)]
    else:
        # The window reaches into last year: take this year so far plus the
        # tail of the previous year.
        last_year_days = int(
            datetime.datetime(today.year - 1, 12, 31).strftime('%j'))
        windows = [(0, today.year),
                   (last_year_days + day - 7, today.year - 1)]

    tweets = tweet_objects.tweet_set()
    for after_day, year in windows:
        tweets.get_tweets({
            '$and': [{'created_day': {'$gt': after_day}},
                     {'created_year': year}]
        })
    tweets.preprocess_tweets()

    existing_users = user_objects.user_set()
    existing_users.get_users({})
    excluded = set(get_mute_users())
    excluded.update(user.user['screen_name'] for user in existing_users.users)

    mention_counts = Counter()
    for tweet in tweets.tweets:
        mention_counts.update(tweet.tweet_mentions)
    for screen_name in excluded:
        # pop with a default replaces the bare ``except: pass``.
        mention_counts.pop(screen_name, None)

    #extra processing
    now = datetime.datetime.now()
    created_year = int(now.year)
    created_day = int(now.strftime('%j'))
    for screen_name, activity in mention_counts.most_common(30):
        mongo_app.db.users.insert_one({
            'screen_name': screen_name,
            'activity': activity,
            'core_user': None,
            'status': None,
            'created_year': created_year,
            'created_day': created_day,
            'made_user': False
        })
    return 'done'
def tweets_celery_collector(self, query):
    """Download recent tweets for a set of users and store the new ones.

    Keys read from ``query``: ``days`` (how far back to collect),
    ``source`` (suffix of the status fields in the ``data_collection``
    report) and ``user_payload`` (filter passed to ``user_set.get_users``).

    Tweets created strictly between ``today - days`` and ``today - 2 days``
    (both at midnight) are kept, annotated with ``created_day``,
    ``created_year`` and ``tweeted_hour``, de-duplicated by ``id`` and
    inserted unless a tweet with that ``id`` is already stored.

    Fixes over the original:
      * a failed timeline request (other than a network failure) now skips
        the rest of that user, instead of re-processing the previous user's
        page or hitting an unbound variable;
      * paging uses ``max_id = last_id - 1``. ``max_id`` is inclusive, so
        the old call returned the same tweet again and could loop forever;
      * duplicates are removed from the list that is actually inserted.

    Returns:
        ``'No Network'`` when a request fails with a reason starting with
        'Failed to send request'; otherwise ``None``.
    """
    #connecting to db
    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()

    #today
    today = datetime.date.today()

    #connecting to twitter
    api_keys = connections.twitter_api_keys()
    api_keys.get_existing_keys(no_of_keys=1)
    api = api_keys.keys[0].connect_to_twitter()

    #setting time period
    todaydt = datetime.datetime(today.year, today.month, today.day, 0, 0, 0)
    startDate = todaydt - datetime.timedelta(days=query['days'])
    endDate = todaydt - datetime.timedelta(days=2)

    #set status in db
    update_query = {
        '$set': {
            f"last_day.{query['source']}": int(today.strftime('%j')),
            f"tweets_status.{query['source']}": 'collecting',
            f"duration.{query['source']}": f"{startDate} - {endDate}"
        }
    }
    mongo_app.db.reports.update_one({'report': 'data_collection'},
                                    update_query)

    #getting users
    users = user_objects.user_set()
    users.get_users(query['user_payload'])

    def is_network_error(error):
        """Tell whether *error* reports that the request could not be sent."""
        reason = getattr(error, 'reason', None)
        return (isinstance(reason, str)
                and reason.startswith('Failed to send request'))

    #collect tweets
    tweets = []
    i = 0  # number of timeline requests issued
    j = 1  # 1-based position of the current user
    for user in users.users:
        screen_name = user.user['screen_name']
        max_id = None
        while True:
            i = i + 1
            try:
                if max_id is None:
                    page = api.user_timeline(screen_name)
                else:
                    page = api.user_timeline(screen_name, max_id=max_id)
            except tweepy.TweepError as e:
                if is_network_error(e):
                    return 'No Network'
                print('Skipping' + screen_name)
                break
            print(i, screen_name, j)
            if not page:
                print('*=*=*=*= NO TWEETS BY *=*=*=*=*=', user, j)
                break
            for tweet in page:
                if startDate < tweet.created_at < endDate:
                    tweet._json['created_day'] = int(
                        tweet.created_at.strftime('%j'))
                    tweet._json['created_year'] = tweet.created_at.year
                    tweet._json['tweeted_hour'] = int(
                        tweet._json['created_at'][11:13])
                    tweets.append(tweet)
            oldest = page[-1]
            if not oldest.created_at > startDate:
                break
            print("Last Tweet @", oldest.created_at, " - fetching some more")
            # max_id is inclusive; step below the oldest tweet seen.
            max_id = oldest.id - 1
        j = j + 1
        #updating status in reports
        mongo_app.db.reports.update_one({'report': 'data_collection'}, {
            '$set': {
                f"tweets_user.{query['source']}": screen_name
            }
        })

    #pulling json part of tweets status collected, removing duplicates
    unique_tweets = {}
    for status in tweets:
        unique_tweets.setdefault(status._json['id'], status._json)

    #inserting to database
    count = 0
    for i, tweet in enumerate(unique_tweets.values()):
        tweet['downloaded_day_year'] = int(today.strftime('%j'))
        tweet['downloaded_year'] = int(today.year)
        tweet.pop('_id', None)
        print(i)
        if mongo_app.db.tweets.find_one({'id': tweet['id']}) is None:
            mongo_app.db.tweets.insert_one(tweet)
            count += 1

    #set status in db
    mongo_app.db.reports.update_one({'report': 'data_collection'}, {
        '$set': {
            f"tweets_status.{query['source']}": 'collected',
            f"quantity.{query['source']}": count
        }
    })
def user_friends_celery_collector(self, query):
    """Download the friend ids of a set of users and store them.

    Keys read from ``query``: ``source`` (suffix of the status fields in the
    ``data_collection`` report), ``user_friends_payload`` (filter passed to
    ``user_set.get_users``) and ``type`` (compared with the stored progress
    marker to decide whether an earlier run can be resumed).

    Progress is written to ``friends_user`` in the report after every user.
    A later run with the same ``type`` resumes one user before the recorded
    position, unless that position was the first or the last user. On a
    rate-limit error the task sleeps 60 seconds and switches to the next
    API key.

    Fixes over the original:
      * ``user_no`` is always initialised; it used to be unbound when no
        report existed or the stored ``type`` differed;
      * a report without ``friends_user`` no longer raises ``KeyError``;
      * every non-network error skips the user, so a failing account can
        no longer be retried forever.

    Returns:
        ``'No Network'`` when a request fails with a reason starting with
        'Failed to send request'; otherwise ``None``.
    """
    #connecting to db
    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()

    #today
    today = datetime.date.today()

    #connecting to twitter
    api_keys = connections.twitter_api_keys()
    api_keys.get_existing_keys()

    #set status in db
    update_query = {
        '$set': {
            f"last_day.{query['source']}": int(today.strftime('%j')),
            f"user_friends_status.{query['source']}": 'collecting'
        }
    }
    mongo_app.db.reports.update_one({'report': 'data_collection'},
                                    update_query)

    #getting users
    users = user_objects.user_set()
    users.get_users(query['user_friends_payload'])
    total_users = len(users.users)

    #checking if it stopped in between
    user_no = 0
    report = mongo_app.db.reports.find_one({'report': 'data_collection'})
    progress = (report or {}).get('friends_user')
    if progress and progress.get('type') == query['type']:
        last_user_no = progress.get('user_no', 0)
        # A run that stopped on the first or the last user starts over;
        # otherwise step back one user and redo it.
        if last_user_no != 0 and last_user_no != total_users - 1:
            user_no = last_user_no - 1

    key_no = 0
    api = api_keys.keys[key_no].connect_to_twitter_no_wait()
    while user_no < total_users:
        screen_name = users.users[user_no].user['screen_name']
        try:
            friends = [{'id': friend_id} for friend_id in tweepy.Cursor(
                api.friends_ids, screen_name=screen_name).items()]
            #Printing Status
            print(len(friends), screen_name, user_no)
            #updating dB
            mongo_app.db.community.update_one(
                {'_id': users.user_ids[user_no]},
                {'$set': {
                    'friends_id': friends
                }})
            #writing status to reports
            mongo_app.db.reports.update_one({'report': 'data_collection'}, {
                '$set': {
                    'friends_user': {
                        'screen_name': screen_name,
                        'user_no': user_no,
                        'type': query['type']
                    }
                }
            })
            user_no = user_no + 1
        except tweepy.RateLimitError:
            # Wait, then retry the same user with the next key.
            time.sleep(60)
            key_no = (key_no + 1) % (len(api_keys.keys))
            api = api_keys.keys[key_no].connect_to_twitter_no_wait()
            print(key_no)
            print(88)
        except Exception as e:
            reason = getattr(e, 'reason', None)
            if (isinstance(reason, str)
                    and reason.startswith('Failed to send request')):
                return 'No Network'
            print('skipping ' + screen_name)
            print(e)
            user_no += 1

    #set status in db
    mongo_app.db.reports.update_one(
        {'report': 'data_collection'},
        {'$set': {
            f"user_friends_status.{query['source']}": 'collected'
        }})
def process(self, query):
    """Compute one user's activity and bond strength to every other user.

    Keys read from ``query``: ``user`` (screen name of the user to process)
    and ``weights`` (``tweet_weight``, ``tweet_mention_weight``,
    ``retweet_weight``, ``quote_weight``, ``reply_weight``).

    An edge is created for every other community user, with
    ``is_follower`` set when this user's id appears in the other user's
    ``friends_id``. For each of the user's tweets from the last seven days,
    every follower edge gains ``tweet_weight`` and every edge whose screen
    name is mentioned gains the weight selected by ``tweet.tweet_type``.
    ``activity`` (the tweet count) and ``edges`` are then written to the
    user's ``community`` document.

    Raises:
        IndexError: no user with the given screen name exists.
    """
    weights = query['weights']

    matches = user_objects.user_set()
    matches.get_users({'user.screen_name': query['user']})
    user = matches.users[0]
    user_id = user.user['id']
    screen_name = user.user['screen_name']

    all_users = user_objects.user_set()
    all_users.get_users({})
    user.edges = [
        {
            'id': other.user['id'],
            'screen_name': other.user['screen_name'],
            'is_follower': any(friend['id'] == user_id
                               for friend in other.friends_id),
            'bond': 0
        }
        for other in all_users.users
        if other.user['screen_name'] != screen_name
    ]

    #today and seven days back
    today = datetime.date.today()
    seven_days_back = today - datetime.timedelta(days=7)
    author = {'user.id': user_id}
    after_start = {
        '$and': [{'created_day': {'$gt': int(seven_days_back.strftime('%j'))}},
                 {'created_year': seven_days_back.year}]
    }
    until_today = {
        '$and': [{'created_day': {'$lte': int(today.strftime('%j'))}},
                 {'created_year': today.year}]
    }
    if seven_days_back.year == today.year:
        tweet_queries = [{'$and': [author, after_start, until_today]}]
    else:
        # The window straddles New Year: query each year separately.
        tweet_queries = [{'$and': [author, after_start]},
                         {'$and': [author, until_today]}]

    tweets = tweet_objects.tweet_set()
    for tweets_query in tweet_queries:
        tweets.get_tweets(tweets_query)
    tweets.preprocess_tweets()
    user.activity = len(tweets.tweets)

    # Keyed by tweet.tweet_type, plus 'base' for the per-tweet follower bonus.
    now_weights = {
        'base': weights['tweet_weight'],
        'tweet': weights['tweet_mention_weight'],
        'retweet': weights['retweet_weight'],
        'quote': weights['quote_weight'],
        'reply': weights['reply_weight']
    }
    for tweet in tweets.tweets:
        for edge in user.edges:
            if edge['is_follower']:
                edge['bond'] = edge['bond'] + now_weights['base']
            if edge['screen_name'] in tweet.tweet_mentions:
                edge['bond'] = edge['bond'] + now_weights[tweet.tweet_type]

    mongo_app = connections.mongo()
    mongo_app.connect_to_mongo()
    mongo_app.db.community.update_one(
        {'user.screen_name': screen_name},
        {'$set': {
            'activity': user.activity,
            'edges': user.edges
        }})