def __init__(self, discord_client, server_id, channels=None):
    """Store the Discord client, target server and emote regex patterns.

    Args:
        discord_client: Discord client object; kept under both
            ``self.client`` and ``self.discord_client``.
        server_id: Id of the server this instance works with.
        channels: Optional collection of channel ids. Defaults to an
            empty list when omitted.
    """
    self.server_id = server_id
    # A literal ``[]`` default would be one list shared by every instance
    # created without the argument, so build a fresh one per call instead.
    self.channels = [] if channels is None else channels
    self.client = discord_client
    self.db_client = PGClient()
    self.discord_client = discord_client
    # Materialised as a list: a lazy ``map`` object would be exhausted by
    # the ``join`` below, leaving the attribute empty for any later reader.
    self.unicode_emote_list = [
        ''.join(x.split()) for x in emoji.UNICODE_EMOJI.keys()
    ]
    self.unicode_emote_ptrn = re.compile(
        '|'.join(re.escape(p) for p in self.unicode_emote_list)
    )
    # Raw string so ``\w`` reaches the regex engine without relying on
    # Python's handling of unknown string escapes.
    self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')
def __init__(self, channel_id, first_id, last_id, type):
    """Configure the analyzer for one message-id range of a channel.

    Registers the ParallelDots API key from settings, remembers the
    channel id, the first/last message ids and the analysis type, then
    compiles the regex patterns via ``init_regex_patterns``.
    """
    # The API key is registered first, before anything else is set up.
    set_api_key(settings.PARALLELDOTS_API_KEY)

    # Message-range selection exactly as passed in by the caller.
    self.channel_id, self.first_id, self.last_id = (
        channel_id,
        first_id,
        last_id,
    )
    self.type = type

    self.IGNORED_MEMBER_IDS = settings.IGNORED_MEMBER_IDS
    self.db_client = PGClient()
    self.MAX_PIECE_LEN = 3000

    self.init_regex_patterns()
class DataProcessor:
    """Download messages, reactions and emotes of a server and save them
    through ``PGClient``."""

    def __init__(self, discord_client, server_id, channels=None):
        """Store the Discord client, target server and emote patterns.

        Args:
            discord_client: Discord client object; kept under both
                ``self.client`` and ``self.discord_client``.
            server_id: Id of the server to download from.
            channels: Optional collection of channel ids to restrict the
                download to. When empty or omitted, every channel of the
                server is downloaded.
        """
        self.server_id = server_id
        # A literal ``[]`` default would be shared between instances.
        self.channels = [] if channels is None else channels
        self.client = discord_client
        self.db_client = PGClient()
        self.discord_client = discord_client
        # A list rather than a lazy ``map``: the iterator would be used up
        # by the ``join`` below and the attribute would stay empty.
        self.unicode_emote_list = [
            ''.join(x.split()) for x in emoji.UNICODE_EMOJI.keys()
        ]
        self.unicode_emote_ptrn = re.compile(
            '|'.join(re.escape(p) for p in self.unicode_emote_list)
        )
        self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')

    async def collect_data(self):
        """ Download messages from server """
        client = self.client
        channels = self.channels
        logging.info('server id: {}'.format(self.server_id))
        server = client.get_server(self.server_id)
        for m in server.members:
            self.db_client.save_member(m)

        if channels:
            chosen_channels = [c for c in server.channels if c.id in channels]
        else:
            chosen_channels = server.channels

        i = 0  # messages processed across all channels
        for c in chosen_channels:
            logging.info('Downloading messages from channel {} ...'.format(c))
            c_i = 0  # messages processed in this channel
            async for log in client.logs_from(c, limit=1000000000):
                await self.save_data(log)
                # Count first, then report, so the logged total matches the
                # number of messages that have actually been saved.
                i += 1
                c_i += 1
                if i % 1000 == 0:
                    logging.info('Processed {} messages in total'.format(i))
            logging.info(
                'Channel {} done. Downloaded {} messages'.format(c, c_i)
            )

    async def save_data(self, log):
        """ Process and save downloaded data """
        reactions = await self.reactions_to_dict(log.reactions)
        emotes = self.extract_emotes(log)
        self.db_client.save_reactions(reactions)
        self.db_client.save_emotes(emotes)
        self.db_client.save_message(log)
        self.db_client.save_member(log.author)

    async def reactions_to_dict(self, reactions):
        """ Transform reactions to list of dictionaries

        Each dictionary holds ``message_id``, ``emote_id`` (the emoji id
        for custom emoji, the emoji itself otherwise) and ``members``
        (ids of the users returned for that reaction).
        """
        reactions_dict = []
        for r in reactions:
            emote = r.emoji.id if r.custom_emoji else r.emoji
            # At most 100 users are requested per reaction.
            members = \
                await self.discord_client.get_reaction_users(r, limit=100)
            reactions_dict.append({
                'message_id': r.message.id,
                'emote_id': emote,
                'members': [m.id for m in members],
            })
        return reactions_dict

    def extract_emotes(self, log):
        """ Count custom and unicode emotes used in a message body

        Returns a dict keyed by custom emote id or by the unicode emote
        itself; each value holds ``name``, ``count``, ``member_id`` and
        ``posted_at``.
        """
        body = log.content
        member_id = log.author.id
        posted_at = log.timestamp
        custom_emotes = {}
        unicode_emotes = {}

        for e in self.custom_emote_ptrn.findall(body):
            # '<:name:id>' -> 'name:id' -> ('name', 'id')
            e_name, e_id = e[2:-1].split(':')
            if e_id not in custom_emotes:
                custom_emotes[e_id] = {
                    'name': e_name,
                    'count': 1,
                    'member_id': member_id,
                    'posted_at': posted_at,
                }
            else:
                # The most recently seen name for this id wins.
                custom_emotes[e_id]['name'] = e_name
                custom_emotes[e_id]['count'] += 1

        for e in self.unicode_emote_ptrn.findall(body):
            if e not in unicode_emotes:
                unicode_emotes[e] = {
                    'name': e,
                    'count': 1,
                    'member_id': member_id,
                    'posted_at': posted_at,
                }
            else:
                unicode_emotes[e]['count'] += 1

        return {**custom_emotes, **unicode_emotes}
def __init__(self):
    """Create the ``PGClient`` instance stored as ``self.client``."""
    self.client = PGClient()
class Activity:
    """Average message counts per weekday and per hour of the day."""

    def __init__(self):
        """Create the ``PGClient`` instance used for all queries."""
        self.client = PGClient()

    def per_day(self):
        """ Return average amount of messages per day in a week """
        # NOTE(review): the grouped count has no lower date bound while the
        # first-message query starts at 2017-08-01 -- confirm that counting
        # older messages against the shorter period is intended.
        q_grouped_days = """
            SELECT extract(isodow from posted_at)::int AS day_group,
                   count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY day_group
            ORDER BY day_group
        """
        q_first_msg = """
            SELECT date_trunc('day', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
              AND posted_at >= '2017-08-01'
            ORDER BY posted_at
            LIMIT 1
        """
        q_last_msg = """
            SELECT date_trunc('day', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
            ORDER BY posted_at DESC
            LIMIT 1
        """
        # isodow numbers days 1..7 starting at Monday, which lines up with
        # the Monday == 0 slots used by get_avg_activity.
        return self._avg_from_queries(q_grouped_days, q_first_msg,
                                      q_last_msg, range(1, 8), 'day')

    def per_hour(self):
        """ Return average amount of messages per hour in a day """
        q_grouped_hours = """
            SELECT extract(hour from posted_at)::int AS hour_group,
                   count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY hour_group
            ORDER BY hour_group
        """
        q_first_msg = """
            SELECT date_trunc('hour', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
              AND posted_at >= '2017-08-01'
            ORDER BY posted_at
            LIMIT 1
        """
        q_last_msg = """
            SELECT date_trunc('hour', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
            ORDER BY posted_at DESC
            LIMIT 1
        """
        return self._avg_from_queries(q_grouped_hours, q_first_msg,
                                      q_last_msg, range(24), 'hour')

    def _avg_from_queries(self, q_grouped, q_first_msg, q_last_msg,
                          segment_keys, time_unit):
        """Run the three queries and average the grouped counts.

        ``segment_keys`` lists every group value the grouped query can
        produce, in slot order. Groups missing from the result get a count
        of 0 so later counts are not shifted into the wrong slot.
        """
        values = (settings.IGNORED_MEMBER_IDS, )
        grouped = self.client.query(q_grouped, values).fetchall()
        counts = {row[0]: row[1] for row in grouped}
        sums_per_segment = [counts.get(key, 0) for key in segment_keys]
        date_from = self.client.query(q_first_msg, values).fetchall()[0][0]
        date_to = self.client.query(q_last_msg, values).fetchall()[0][0]
        return self.get_avg_activity(date_from=date_from,
                                     date_to=date_to,
                                     sums_per_segment=sums_per_segment,
                                     time_unit=time_unit)

    def get_avg_activity(self, date_from, date_to, sums_per_segment,
                         time_unit):
        """ Return average amount of messages per the chosen time
        segment (hour or day)

        Args:
            date_from: datetime of the first segment of the period.
            date_to: datetime of the last segment (inclusive).
            sums_per_segment: total message count per slot, Monday first
                for 'day', hour 0 first for any other ``time_unit``.
            time_unit: 'day' for weekday slots; anything else is treated
                as hours of the day.

        Returns:
            A list whose first item is the header ``(time_unit, 'count')``
            followed by ``(slot_index, rounded_average)`` tuples.
        """
        if time_unit == 'day':
            slots = 7
            td = date_to - date_from + timedelta(days=1)
            complete_period, remainder = divmod(td.days, slots)
            first_segment = date_from.weekday()
        else:
            slots = 24
            td = date_to - date_from + timedelta(hours=1)
            total_hours = int(td.total_seconds() // 3600)
            # Leftover hours after the whole days, not leftover weekdays.
            complete_period, remainder = divmod(total_hours, slots)
            first_segment = date_from.hour

        # Every slot occurs once per complete week/day; the slots covered
        # by the trailing partial period occur one more time.
        time_segments = [complete_period] * slots
        for i in range(remainder):
            time_segments[(first_segment + i) % slots] += 1

        avg_sum_per_segment = [(time_unit, 'count')]
        pairs = zip(sums_per_segment, time_segments)
        for n, (seg_sum, seg) in enumerate(pairs):
            # A slot that never occurs in the period has no average.
            avg_sum = round(seg_sum / seg) if seg else 0
            avg_sum_per_segment.append((n, avg_sum))
        return avg_sum_per_segment
class DiscussionAnalyzer:
    """Sentiment and emotion analysis for a range of channel messages."""

    def __init__(self, channel_id, first_id, last_id, type):
        """Store the message range and prepare API key, DB and patterns.

        Args:
            channel_id: Channel whose messages are analyzed.
            first_id: Lowest message id included (inclusive).
            last_id: Highest message id included (inclusive).
            type: 'both', 'member', or anything else for the
                whole-discussion analysis only (see ``analyze``).
        """
        set_api_key(settings.PARALLELDOTS_API_KEY)
        self.channel_id = channel_id
        self.first_id = first_id
        self.last_id = last_id
        self.type = type
        self.IGNORED_MEMBER_IDS = settings.IGNORED_MEMBER_IDS
        self.db_client = PGClient()
        self.MAX_PIECE_LEN = 3000
        self.init_regex_patterns()

    def init_regex_patterns(self):
        """ Compile regex patterns """
        unicode_emote_list = map(lambda x: ''.join(x.split()),
                                 emoji.UNICODE_EMOJI.keys())
        self.unicode_emote_ptrn = re.compile(
            '|'.join(re.escape(p) for p in unicode_emote_list)
        )
        # Raw strings keep the backslash escapes intact for the regex
        # engine; the compiled patterns are the same as before.
        self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')
        self.tag_ptrn = re.compile('<@!?[0-9]+>')
        self.url_ptrn = re.compile(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        )

    def analyze(self):
        """ Return analysis results """
        if self.type == 'both':
            return [self.__analyze(), self.__analyze_by_member()]
        elif self.type == 'member':
            return [self.__analyze_by_member()]
        else:
            return [self.__analyze()]

    def __analyze(self):
        """ Analyze discussion emotions and sentiments """
        d = self.get_messages()
        d = self.clean(self.extract_body(d))
        num_of_characters = len(d)
        d = self.split_by_len(d)
        feelings = self.get_feelings(d)
        feelings[0] = ('num_of_characters',) + feelings[0]
        feelings[1] = (num_of_characters,) + feelings[1]
        return feelings

    def __analyze_by_member(self):
        """ Analyze messages of each member separately """
        member_d = self.get_messages(by_member=True)
        results = []
        for member_id, d in member_d.items():
            d = self.clean(self.extract_body(d))
            num_of_characters = len(d)
            if num_of_characters < 1:
                # Nothing left after cleaning, so there is nothing to rate.
                continue
            d = self.split_by_len(d)
            feelings = self.get_feelings(d)
            if not results:
                header = ('member_id', 'member_name',
                          'num_of_characters') + feelings[0]
                results.append(header)
            member_name = self.__member_name_by_id(member_id)
            row = (member_id, member_name, num_of_characters) + feelings[1]
            results.append(row)
        return results

    def get_messages(self, by_member=False):
        """ Return messages from db. If by_member == true, return dict
        where key is a member id (as a string) and value is the list of
        that member's messages """
        q_messages = """SELECT json_build_object(
                            'id',id,'posted_at',posted_at,
                            'content',content,
                            'member_id', member_id)
                        FROM messages
                        WHERE channel_id = %s
                          AND member_id NOT IN %s
                          AND id >= %s AND id <= %s
                        ORDER BY id"""
        values = (self.channel_id, self.IGNORED_MEMBER_IDS,
                  self.first_id, self.last_id)
        cursor = self.db_client.query(q_messages, values)
        if not by_member:
            return [m[0] for m in cursor.fetchall()]

        messages = {}
        message = cursor.fetchone()
        while message:
            message = message[0]
            member_id = str(message['member_id'])
            messages.setdefault(member_id, []).append(message)
            message = cursor.fetchone()
        return messages

    def extract_body(self, messages):
        """ Return the bodies of the messages joined by single spaces """
        return ' '.join([m['content'] for m in messages])

    def clean(self, t):
        """ Clean text from url, tags, emotes, multiple spaces,
        new line symbols and double quotes """
        t = str(t)
        # One pass with the compiled pattern instead of re-using every
        # matched emote text as a regex of its own.
        t = self.custom_emote_ptrn.sub('', t)
        t = self.unicode_emote_ptrn.sub('', t)
        t = self.url_ptrn.sub('', t)
        t = self.tag_ptrn.sub('', t)
        t = t.replace('\n', ' ')
        t = t.replace('"', '')
        t = re.sub(r'\s+', ' ', t)
        return t.strip()

    def split_by_len(self, text):
        """ Split text to segments where max length of each segment
        is MAX_PIECE_LEN

        Words are never cut, so a single word of MAX_PIECE_LEN or more
        characters becomes a segment of its own. Empty input yields a
        single empty segment.
        """
        max_len = self.MAX_PIECE_LEN
        current_len = 0
        text_piece = ''
        texts = []
        for w in text.split(' '):
            if (current_len + len(w)) < max_len:
                text_piece += w + ' '
            else:
                # Skip the flush while nothing has been collected yet;
                # otherwise an over-long first word produced an empty
                # leading segment.
                finished = text_piece.strip()
                if finished:
                    texts.append(finished)
                text_piece = w + ' '
            current_len = len(text_piece)
        texts.append(text_piece.strip())
        return texts

    def get_feelings(self, texts):
        """ Return sentiment and emotion scores for the given texts

        Returns a two-item list: a tuple of score names
        (``s_*`` then ``e_*``) and a tuple of the matching values,
        averaged over ``texts`` and scaled so each group sums to 1.
        """
        sentiment_labels = ('negative', 'neutral', 'positive')
        emotion_labels = ('angry', 'excited', 'happy', 'indifferent', 'sad')
        sentiments = {'s_' + label: [] for label in sentiment_labels}
        emotions = {'e_' + label: [] for label in emotion_labels}

        for t_part in texts:
            sent = sentiment(t_part)
            emot = emotion(t_part)
            for label in sentiment_labels:
                sentiments['s_' + label].append(sent['probabilities'][label])
            for label in emotion_labels:
                emotions['e_' + label].append(emot['probabilities'][label])

        sentiments = self.scale_to_one(self.get_avg_feelings(sentiments))
        emotions = self.scale_to_one(self.get_avg_feelings(emotions))
        result = [(*sentiments.keys(), *emotions.keys()),
                  (*sentiments.values(), *emotions.values())]
        return result

    def get_avg_feelings(self, feelings):
        """ Return average of feelings values

        ``feelings`` maps a name to a list of values; the result maps the
        same names to the mean of each list (0.0 for an empty list).
        """
        avg_feelings = {}
        for f, vals in feelings.items():
            # Divide by the number of values of this feeling, not by the
            # number of feelings in the dict.
            avg_feelings[f] = sum(vals) / len(vals) if vals else 0.0
        return avg_feelings

    def scale_to_one(self, feelings):
        """ Scale values of feelings to make their sum equal to 1 """
        keys = feelings.keys()
        values = np.array(list(feelings.values()))
        values = values / values.sum()
        return dict(zip(keys, values))

    def __member_name_by_id(self, member_id):
        """ Return member name by his id, or None if no such member """
        q = "SELECT name FROM members WHERE id = %s"
        cursor = self.db_client.query(q, (member_id,))
        row = cursor.fetchone()
        # A member without a row in ``members`` must not abort the run.
        return row[0] if row else None
def __init__(self):
    """Create the ``PGClient`` instance and read the result limit."""
    self.client = PGClient()
    # Taken from settings.RESULT_LIMIT and kept on the instance.
    self.limit = settings.RESULT_LIMIT
class SQLStats:
    """Statistics computed directly in SQL; every result starts with a
    row of column names."""

    def __init__(self):
        """Create the DB client and read the row limit from settings."""
        self.client = PGClient()
        self.limit = settings.RESULT_LIMIT

    def query_stats(self, q, values):
        """ Run ``q`` with ``values`` and return the column-name tuple
        followed by all fetched rows, as one list """
        cursor = self.client.query(q, values)
        # Transpose the description so the first entry holds the names.
        transposed = list(zip(*cursor.description))
        rows = [transposed[0]]
        rows.extend(cursor.fetchall())
        return rows

    def most_reacting(self):
        """ Members ordered by the number of reactions they gave """
        ignored = settings.IGNORED_MEMBER_IDS
        sql = """
            SELECT r.member_id, m.name, count(*)
            FROM reactions r
            JOIN members m ON r.member_id = m.id
            WHERE r.member_id NOT IN %s
            GROUP BY r.member_id, m.name
            ORDER BY count DESC
            LIMIT %s
        """
        return self.query_stats(sql, (ignored, self.limit))

    def most_reacted(self, order_by_ratio=False):
        """ Members ordered by reactions received, or by the
        reactions-per-message ratio when ``order_by_ratio`` is true """
        sort_column = 'ratio' if order_by_ratio else 'num_of_reactions'
        template = """
            WITH reaction_count AS (
                SELECT m.member_id, count(*) AS num_of_reactions
                FROM reactions r
                JOIN messages m ON m.id = r.message_id
                GROUP BY m.member_id
            ),
            message_count AS (
                SELECT member_id, count(*) as num_of_messages
                FROM messages
                GROUP BY member_id
            )
            SELECT m.name, rc.num_of_reactions, mc.num_of_messages,
                   rc.num_of_reactions / mc.num_of_messages::float as ratio
            FROM reaction_count rc
            JOIN message_count mc ON mc.member_id = rc.member_id
            JOIN members m ON m.id = rc.member_id
            WHERE rc.member_id NOT IN %s
              AND mc.num_of_messages >= 10
            ORDER BY {} DESC
            LIMIT %s
        """
        # Only one of the two fixed column names above is ever inserted.
        sql = template.format(sort_column)
        return self.query_stats(sql, (settings.IGNORED_MEMBER_IDS, self.limit))

    def activity_trend(self):
        """ Message counts grouped by year and month """
        sql = """
            SELECT extract(month from posted_at)::int AS m,
                   extract(year from posted_at)::int AS y,
                   count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY y, m
            ORDER BY y,m
        """
        params = (settings.IGNORED_MEMBER_IDS, )
        return self.query_stats(sql, params)

    def most_used_emotes(self):
        """ Emotes ordered by combined use in messages and reactions """
        sql = """
            WITH _emotes AS (
                SELECT lower(name) AS e_name, sum(count) AS e_count
                FROM emotes
                WHERE member_id NOT IN %s
                GROUP BY e_name
            ),
            _reactions AS (
                SELECT lower(e.name) as e_name, count(*) AS r_count
                FROM reactions r
                JOIN emotes e ON e.emote_id = r.emote_id
                WHERE r.member_id NOT IN %s
                GROUP BY e_name
            )
            SELECT e.e_name, e.e_count, r.r_count,
                   (e.e_count + r.r_count) AS total
            FROM _emotes e
            JOIN _reactions r ON r.e_name = e.e_name
            ORDER BY total DESC
            LIMIT %s;
        """
        # The ignore list is bound twice, once per CTE.
        ignored = settings.IGNORED_MEMBER_IDS
        params = (ignored, ignored, self.limit)
        return self.query_stats(sql, params)

    def most_active_member(self):
        """ Members ordered by the number of messages they posted """
        ignored = settings.IGNORED_MEMBER_IDS
        sql = """
            SELECT member_id, mbr.name, count(*)
            FROM messages m
            JOIN members mbr ON mbr.id = m.member_id
            WHERE m.member_id NOT IN %s
            GROUP BY m.member_id, mbr.name
            ORDER BY count DESC
            LIMIT %s
        """
        return self.query_stats(sql, (ignored, self.limit))

    def most_mentioned_member(self):
        """ Members ordered by how often they appear in ``mentions`` """
        ignored = settings.IGNORED_MEMBER_IDS
        sql = """
            WITH u_mentions AS (
                SELECT unnest(mentions) as _member_id, count(*)
                FROM messages
                GROUP BY _member_id
            )
            SELECT mbr.name, u.*
            FROM u_mentions u
            JOIN members mbr ON mbr.id = u._member_id
            WHERE u._member_id NOT IN %s
            ORDER BY u.count DESC
            LIMIT %s
        """
        return self.query_stats(sql, (ignored, self.limit))