Пример #1
0
    def __init__(self, discord_client, server_id, channels=None):
        """
        Store the Discord client, the target server and the channel filter,
        open a ``PGClient`` and compile the emote patterns.

        discord_client -- client object used to talk to Discord
        server_id      -- id of the server to work on
        channels       -- optional collection of channel ids; omitted or
                          empty means no channel filter
        """
        self.server_id = server_id
        # A ``[]`` default would be one list object shared by every instance
        # created without an explicit ``channels`` argument.
        self.channels = [] if channels is None else channels
        self.client = discord_client
        self.db_client = PGClient()
        self.discord_client = discord_client

        # Materialise the names: a bare map() iterator would be exhausted by
        # the join below and leave this attribute permanently empty.
        self.unicode_emote_list = [''.join(x.split())
                                   for x in emoji.UNICODE_EMOJI.keys()]
        # Regex alternation takes the first alternative that matches, so the
        # longest sequences must come first; otherwise an emoji made of
        # several code points is matched as its shorter prefix.
        longest_first = sorted({e for e in self.unicode_emote_list if e},
                               key=lambda e: (-len(e), e))
        self.unicode_emote_ptrn = re.compile('|'.join(re.escape(p)
                                             for p in longest_first))
        self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')
Пример #2
0
    def __init__(self, channel_id,
                 first_id, last_id, type):
        """
        Register the ParallelDots API key, remember which slice of a channel
        is to be analysed, open a ``PGClient`` and compile the regex patterns.

        channel_id -- channel whose messages are analysed
        first_id   -- lowest message id of the analysed range
        last_id    -- highest message id of the analysed range
        type       -- analysis mode selector stored on the instance
        """
        set_api_key(settings.PARALLELDOTS_API_KEY)

        # Fixed configuration.
        self.MAX_PIECE_LEN = 3000
        self.IGNORED_MEMBER_IDS = settings.IGNORED_MEMBER_IDS

        # What to analyse and how.
        self.type = type
        self.channel_id, self.first_id, self.last_id = (
            channel_id, first_id, last_id)

        self.db_client = PGClient()
        self.init_regex_patterns()
Пример #3
0
class DataProcessor:
    """
    Download the members and messages of a Discord server and store them,
    together with their reactions and emotes, through ``PGClient``.
    """

    def __init__(self, discord_client, server_id, channels=None):
        """
        discord_client -- client used to read the server, its message logs
                          and the users behind each reaction
        server_id      -- id handed to ``get_server``
        channels       -- optional collection of channel ids to download;
                          omitted or empty means every channel of the server
        """
        self.server_id = server_id
        # A ``[]`` default would be one list object shared by every instance
        # created without an explicit ``channels`` argument.
        self.channels = [] if channels is None else channels
        self.client = discord_client
        self.db_client = PGClient()
        self.discord_client = discord_client

        # Materialise the names: a bare map() iterator would be exhausted by
        # building the pattern and leave this attribute permanently empty.
        self.unicode_emote_list = [''.join(x.split())
                                   for x in emoji.UNICODE_EMOJI.keys()]
        self.unicode_emote_ptrn = self._build_unicode_emote_pattern(
            self.unicode_emote_list)
        self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')

    @staticmethod
    def _build_unicode_emote_pattern(emotes):
        """
        Compile one regex that matches any of the given emote strings.

        Regex alternation takes the first alternative that matches, so the
        alternatives are ordered longest first; otherwise an emoji made of
        several code points would be matched as its shorter prefix. Empty
        strings and duplicates are dropped.
        """
        ordered = sorted({e for e in emotes if e},
                         key=lambda e: (-len(e), e))
        if not ordered:
            # An empty alternation would match the empty string everywhere;
            # use a pattern that can never match instead.
            return re.compile(r'(?!)')
        return re.compile('|'.join(re.escape(e) for e in ordered))

    async def collect_data(self):
        """
        Download messages from server

        Saves every member of the server, then walks the chosen channels and
        passes each downloaded message to ``save_data``.
        """
        client = self.client
        logging.info('server id: {}'.format(self.server_id))
        server = client.get_server(self.server_id)

        for member in server.members:
            self.db_client.save_member(member)

        if self.channels:
            chosen_channels = [c for c in server.channels
                               if c.id in self.channels]
        else:
            chosen_channels = server.channels

        total = 0
        for channel in chosen_channels:
            logging.info(
                'Downloading messages from channel {} ...'.format(channel))
            in_channel = 0
            async for log in client.logs_from(channel, limit=1000000000):
                await self.save_data(log)
                total += 1
                in_channel += 1

                # Report after counting, so the figure is the number of
                # messages actually processed so far.
                if total % 1000 == 0:
                    logging.info(
                        'Processed {} messages in total'.format(total))
            logging.info(
                'Channel {} done. Downloaded {} messages'.format(channel,
                                                                 in_channel)
                )

    async def save_data(self, log):
        """
        Process and save downloaded data

        Stores the reactions, the emotes, the message itself and its author.
        """
        reactions = await self.reactions_to_dict(log.reactions)
        emotes = self.extract_emotes(log)

        self.db_client.save_reactions(reactions)
        self.db_client.save_emotes(emotes)
        self.db_client.save_message(log)
        self.db_client.save_member(log.author)

    async def reactions_to_dict(self, reactions):
        """
        Transform reactions to list of dictionaries

        Each dictionary has the keys ``message_id``, ``emote_id`` (the emoji
        id for custom emoji, the emoji itself otherwise) and ``members`` (ids
        of the users returned for that reaction; the request is limited
        to 100 users).
        """
        reactions_dict = []
        for r in reactions:
            emote = r.emoji.id if r.custom_emoji else r.emoji

            members = \
                await self.discord_client.get_reaction_users(r, limit=100)

            reactions_dict.append({'message_id': r.message.id,
                                   'emote_id': emote,
                                   'members': [m.id for m in members],
                                   })
        return reactions_dict

    def extract_emotes(self, log):
        """
        Count the emotes used in the body of a message

        Returns a dict keyed by custom emote id or by the unicode emote
        itself. Each value holds ``name``, ``count``, ``member_id`` (the
        author's id) and ``posted_at`` (the message timestamp).
        """
        body = log.content
        member_id = log.author.id
        posted_at = log.timestamp
        custom_emotes = {}
        unicode_emotes = {}

        for e in self.custom_emote_ptrn.findall(body):
            # '<:name:id>' -> 'name', 'id'
            e_name, e_id = e[2:-1].split(':')
            entry = custom_emotes.get(e_id)
            if entry is None:
                custom_emotes[e_id] = {'name': e_name,
                                       'count': 1,
                                       'member_id': member_id,
                                       'posted_at': posted_at
                                       }
            else:
                # Same id seen again: keep the most recent name.
                entry['name'] = e_name
                entry['count'] += 1

        for e in self.unicode_emote_ptrn.findall(body):
            entry = unicode_emotes.get(e)
            if entry is None:
                unicode_emotes[e] = {'name': e,
                                     'count': 1,
                                     'member_id': member_id,
                                     'posted_at': posted_at
                                     }
            else:
                entry['count'] += 1

        return {**custom_emotes, **unicode_emotes}
Пример #4
0
 def __init__(self):
     """Create a ``PGClient`` and keep it on the instance as ``client``."""
     # NOTE(review): PGClient is defined elsewhere; presumably the
     # PostgreSQL access object used for all queries -- confirm.
     self.client = PGClient()
Пример #5
0
class Activity:
    """
    Average message activity per weekday and per hour of day, computed
    from the ``messages`` table.
    """

    def __init__(self):
        """Create the ``PGClient`` used for all queries."""
        self.client = PGClient()

    @staticmethod
    def _dense_counts(rows, size, offset):
        """
        Spread ``(group, count)`` rows over a list of ``size`` counters.

        GROUP BY returns no row for a group without messages, so the rows
        cannot be used positionally; missing groups stay at 0.
        ``offset`` is the value of the first group (1 for isodow, 0 for hour).
        """
        counts = [0] * size
        for row in rows:
            counts[row[0] - offset] = row[1]
        return counts

    def _first_value(self, query, values):
        """Return the first column of the first row, or None without rows."""
        rows = self.client.query(query, values).fetchall()
        return rows[0][0] if rows else None

    def per_day(self):
        """
        Return average amount of messages per day in a week

        The result is a list whose first item is the header
        ``('day', 'count')`` followed by one ``(n, average)`` tuple per
        weekday, n = 0 for Monday. Only the header is returned when the
        period cannot be determined because no message matches.
        """
        q_grouped_days = """
            SELECT extract(isodow from posted_at)::int AS day_group, count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY day_group
            ORDER BY day_group
            """
        q_first_msg = """
            SELECT date_trunc('day', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
                AND posted_at >= '2017-08-01'
            ORDER BY posted_at
            LIMIT 1
        """
        q_last_msg = """
            SELECT date_trunc('day', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
            ORDER BY posted_at DESC
            LIMIT 1
        """

        values = (settings.IGNORED_MEMBER_IDS, )

        grouped_days = self.client.query(q_grouped_days, values).fetchall()
        # isodow runs from 1 (Monday) to 7 (Sunday).
        sums_per_day = self._dense_counts(grouped_days, size=7, offset=1)

        date_from = self._first_value(q_first_msg, values)
        date_to = self._first_value(q_last_msg, values)
        if date_from is None or date_to is None:
            return [('day', 'count')]

        return self.get_avg_activity(date_from=date_from,
                                     date_to=date_to,
                                     sums_per_segment=sums_per_day,
                                     time_unit='day')

    def per_hour(self):
        """
        Return average amount of messages per hour in a day

        The result is a list whose first item is the header
        ``('hour', 'count')`` followed by one ``(hour, average)`` tuple for
        each hour 0..23. Only the header is returned when the period cannot
        be determined because no message matches.
        """
        q_grouped_hours = """
            SELECT extract(hour from posted_at)::int AS hour_group, count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY hour_group
            ORDER BY hour_group
            """
        q_first_msg = """
            SELECT date_trunc('hour', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
                AND posted_at >= '2017-08-01'
            ORDER BY posted_at
            LIMIT 1
        """
        q_last_msg = """
            SELECT date_trunc('hour', posted_at)
            FROM messages
            WHERE member_id NOT IN %s
            ORDER BY posted_at DESC
            LIMIT 1
        """
        values = (settings.IGNORED_MEMBER_IDS, )

        grouped_hours = self.client.query(q_grouped_hours, values).fetchall()
        sums_per_hour = self._dense_counts(grouped_hours, size=24, offset=0)

        date_from = self._first_value(q_first_msg, values)
        date_to = self._first_value(q_last_msg, values)
        if date_from is None or date_to is None:
            return [('hour', 'count')]

        return self.get_avg_activity(date_from=date_from,
                                     date_to=date_to,
                                     sums_per_segment=sums_per_hour,
                                     time_unit='hour')

    def get_avg_activity(self, date_from, date_to, sums_per_segment,
                         time_unit):
        """
        Return average amount of messages
        per the chosen time segment (hour or day)

        date_from, date_to -- first and last segment of the period
                              (both inclusive)
        sums_per_segment   -- message totals, one per segment, starting at
                              Monday (day) or hour 0 (hour)
        time_unit          -- 'day' for weekdays; any other value is treated
                              as hours of the day

        Each total is divided by the number of times its segment occurs in
        the period and rounded. A segment that never occurs yields 0.
        """
        if time_unit == 'day':
            segments_in_cycle = 7
            td = date_to - date_from + timedelta(days=1)
            total_units = td.days
            first_segment = date_from.weekday()
        else:
            segments_in_cycle = 24
            td = date_to - date_from + timedelta(hours=1)
            total_units = int(td.total_seconds() // 3600)
            first_segment = date_from.hour

        # Every segment occurs once per complete cycle (week or day) ...
        complete_period, remainder = divmod(total_units, segments_in_cycle)
        time_segments = [complete_period] * segments_in_cycle

        # ... and the segments of the trailing partial cycle once more.
        for i in range(remainder):
            time_segments[(first_segment + i) % segments_in_cycle] += 1

        avg_sum_per_segment = [(time_unit, 'count')]
        pairs = zip(sums_per_segment, time_segments)
        for n, (seg_sum, occurrences) in enumerate(pairs):
            avg_sum = round(seg_sum / occurrences) if occurrences else 0
            avg_sum_per_segment.append((n, avg_sum))

        return avg_sum_per_segment
Пример #6
0
class DiscussionAnalyzer:
    """
    Sentiment and emotion analysis of the messages of one channel that lie
    inside a message-id range, for the whole discussion and/or per member.
    """

    def __init__(self, channel_id,
                 first_id, last_id, type):
        """
        channel_id -- channel whose messages are analysed
        first_id   -- lowest message id of the analysed range (inclusive)
        last_id    -- highest message id of the analysed range (inclusive)
        type       -- 'both', 'member', or anything else for the
                      whole-discussion analysis only (see ``analyze``)
        """
        set_api_key(settings.PARALLELDOTS_API_KEY)
        self.channel_id = channel_id
        self.first_id = first_id
        self.last_id = last_id
        self.type = type

        self.IGNORED_MEMBER_IDS = settings.IGNORED_MEMBER_IDS
        self.db_client = PGClient()
        self.MAX_PIECE_LEN = 3000
        self.init_regex_patterns()

    def init_regex_patterns(self):
        """
        Compile regex patterns
        """
        unicode_emotes = {''.join(x.split())
                          for x in emoji.UNICODE_EMOJI.keys()}
        # An empty alternative would match at every position.
        unicode_emotes.discard('')
        # Alternation takes the first alternative that matches, so longer
        # sequences go first and are removed as a whole.
        longest_first = sorted(unicode_emotes, key=lambda e: (-len(e), e))
        self.unicode_emote_ptrn = re.compile('|'.join(re.escape(p)
                                             for p in longest_first))
        self.custom_emote_ptrn = re.compile(r'<:\w+:[0-9]+>')
        self.tag_ptrn = re.compile(r'<@!?[0-9]+>')
        self.url_ptrn = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def analyze(self):
        """
        Return analysis results

        Always a list: the whole-discussion result, the per-member result,
        or both, depending on ``self.type``.
        """
        if self.type == 'both':
            return [self.__analyze(), self.__analyze_by_member()]
        elif self.type == 'member':
            return [self.__analyze_by_member()]
        else:
            return [self.__analyze()]

    def __analyze(self):
        """
        Analyze discussion emotions and sentiments

        Returns ``[header, row]`` where both tuples start with the number
        of characters of the cleaned text.
        """
        d = self.get_messages()
        d = self.clean(self.extract_body(d))
        num_of_characters = len(d)
        d = self.split_by_len(d)

        feelings = self.get_feelings(d)
        feelings[0] = ('num_of_characters',) + feelings[0]
        feelings[1] = (num_of_characters,) + feelings[1]
        return feelings

    def __analyze_by_member(self):
        """
        Analyze messages of each member separately

        Returns a list with one header tuple followed by one row per member
        whose cleaned text is not empty; an empty list when there is none.
        """
        member_d = self.get_messages(by_member=True)
        results = []
        for member_id, d in member_d.items():
            d = self.clean(self.extract_body(d))
            num_of_characters = len(d)

            if num_of_characters < 1:
                continue

            d = self.split_by_len(d)

            feelings = self.get_feelings(d)

            if not results:
                header = ('member_id',
                          'member_name',
                          'num_of_characters') + feelings[0]
                results.append(header)

            member_name = self.__member_name_by_id(member_id)
            row = (member_id, member_name, num_of_characters) + feelings[1]

            results.append(row)

        return results

    def get_messages(self, by_member=False):
        """
        Return messages from db. If by_member == true,
        return dict where key is a member and value is
        the list of his messages
        """
        q_messages = """SELECT json_build_object(
                            'id',id,'posted_at',posted_at,
                            'content',content, 'member_id', member_id)
                        FROM messages
                        WHERE channel_id = %s
                        AND member_id NOT IN %s
                        AND id >= %s
                        AND id <= %s
                        ORDER BY id"""

        values = (self.channel_id, self.IGNORED_MEMBER_IDS,
                  self.first_id, self.last_id)
        cursor = self.db_client.query(q_messages, values)

        if not by_member:
            return [m[0] for m in cursor.fetchall()]

        messages = {}
        row = cursor.fetchone()
        while row:
            message = row[0]
            member_id = str(message['member_id'])
            messages.setdefault(member_id, []).append(message)
            row = cursor.fetchone()

        return messages

    def extract_body(self, messages):
        """
        Return the bodies of the messages joined by single spaces
        """
        return ' '.join(m['content'] for m in messages)

    def clean(self, t):
        """
         Clean text from url, tags, emotes, multiple spaces, new line symbols and double quotes
        """
        t = str(t)

        # Use the compiled pattern directly instead of re-using every
        # matched emote as a regex of its own.
        t = self.custom_emote_ptrn.sub('', t)
        t = self.unicode_emote_ptrn.sub('', t)
        t = self.url_ptrn.sub('', t)
        t = self.tag_ptrn.sub('', t)
        t = t.replace('\n', ' ')

        t = t.replace('"', '')
        t = re.sub(r'\s+', ' ', t)

        return t.strip()

    def split_by_len(self, text):
        """
        Split text to segments where max length of
        each segment is MAX_PIECE_LEN

        The text is split on spaces only, so a single word longer than
        MAX_PIECE_LEN is returned as one over-long segment. Empty segments
        are never returned; empty text gives an empty list.
        """
        max_len = self.MAX_PIECE_LEN
        current_len = 0
        text_piece = ''
        texts = []
        for w in text.split(' '):
            if (current_len + len(w)) < max_len:
                text_piece += w + ' '
            else:
                finished = text_piece.strip()
                # Nothing collected yet when the very first word is already
                # too long; do not emit an empty segment for it.
                if finished:
                    texts.append(finished)
                text_piece = w + ' '
            current_len = len(text_piece)

        last = text_piece.strip()
        if last:
            texts.append(last)

        return texts

    def get_feelings(self, texts):
        """
        Return average of feelings values

        Every text is sent to ``sentiment`` and ``emotion``; the
        probabilities are averaged over all texts and each group is scaled
        to sum to 1. Returns ``[names, values]``: two tuples, the sentiment
        entries (prefixed ``s_``) first, then the emotion entries (``e_``).
        """
        sentiment_names = ('negative', 'neutral', 'positive')
        emotion_names = ('angry', 'excited', 'happy', 'indifferent', 'sad')

        sentiments = {'s_' + name: [] for name in sentiment_names}
        emotions = {'e_' + name: [] for name in emotion_names}

        for t_part in texts:
            sent = sentiment(t_part)
            emot = emotion(t_part)

            for name in sentiment_names:
                sentiments['s_' + name].append(sent['probabilities'][name])
            for name in emotion_names:
                emotions['e_' + name].append(emot['probabilities'][name])

        sentiments = self.scale_to_one(self.get_avg_feelings(sentiments))
        emotions = self.scale_to_one(self.get_avg_feelings(emotions))

        return [(*sentiments.keys(), *emotions.keys()),
                (*sentiments.values(), *emotions.values())]

    def get_avg_feelings(self, feelings):
        """
        Return average of feelings values

        ``feelings`` maps a feeling name to its list of values; the result
        maps the same names to the mean of each list (0.0 for an empty list).
        """
        avg_feelings = {}
        for f, vals in feelings.items():
            # Divide by the number of samples, not by the number of feelings.
            avg_feelings[f] = sum(vals) / len(vals) if vals else 0.0

        return avg_feelings

    def scale_to_one(self, feelings):
        """
        Scale values of feelings to
        make their sum equal to 1

        Values that sum to 0 are returned unscaled.
        """
        keys = feelings.keys()
        values = np.array(list(feelings.values()), dtype=float)
        total = values.sum()
        if total:
            values = values / total

        return dict(zip(keys, values))

    def __member_name_by_id(self, member_id):
        """
        Return member name by his id, or None when the id is unknown
        """
        q = "SELECT name FROM members WHERE id = %s"
        cursor = self.db_client.query(q, (member_id,))
        row = cursor.fetchone()

        return row[0] if row else None
Пример #7
0
 def __init__(self):
     """Create a ``PGClient`` and read the result limit from settings."""
     # NOTE(review): PGClient is defined elsewhere; presumably the
     # PostgreSQL access object used for all queries -- confirm.
     self.client = PGClient()
     # Stored from settings.RESULT_LIMIT; how it is applied is not visible
     # in this fragment.
     self.limit = settings.RESULT_LIMIT
Пример #8
0
class SQLStats:
    """
    Ranking and trend statistics answered by single SQL queries.

    Every public method returns what ``query_stats`` returns: a list whose
    first item is the tuple of column names, followed by the result rows.
    """

    def __init__(self):
        """Create the database client and read the row limit from settings."""
        self.client = PGClient()
        self.limit = settings.RESULT_LIMIT

    def query_stats(self, q, values):
        """
        Run ``q`` with ``values`` and return the column names followed by
        all fetched rows, as one list.
        """
        cursor = self.client.query(q, values)
        # The first field of every description entry is the column name.
        header = tuple(column[0] for column in cursor.description)
        rows = cursor.fetchall()

        return [header] + list(rows)

    def most_reacting(self):
        """Members ordered by the number of reactions they gave."""
        sql = """
            SELECT r.member_id, m.name, count(*)
            FROM reactions r
            JOIN members m ON r.member_id = m.id
            WHERE r.member_id NOT IN %s
            GROUP BY r.member_id, m.name
            ORDER BY count DESC
            LIMIT %s
        """
        params = (settings.IGNORED_MEMBER_IDS, self.limit)
        return self.query_stats(sql, params)

    def most_reacted(self, order_by_ratio=False):
        """
        Members ordered by the reactions their messages received, or by
        reactions per message when ``order_by_ratio`` is true. Only members
        with at least 10 messages are listed.
        """
        # Only one of these two fixed column names is ever formatted into
        # the statement.
        sort_column = 'ratio' if order_by_ratio else 'num_of_reactions'

        sql = """
            WITH reaction_count AS (
                SELECT m.member_id, count(*) AS num_of_reactions
                FROM reactions r
                JOIN messages m ON m.id = r.message_id
                GROUP BY m.member_id
            ),
                message_count AS (
                SELECT member_id, count(*) as num_of_messages
                FROM messages
                GROUP BY member_id
            )
            SELECT m.name, rc.num_of_reactions, mc.num_of_messages,
                    rc.num_of_reactions / mc.num_of_messages::float as ratio
            FROM reaction_count rc
            JOIN message_count mc ON mc.member_id = rc.member_id
            JOIN members m ON m.id = rc.member_id
            WHERE rc.member_id NOT IN %s
            AND mc.num_of_messages >= 10
            ORDER BY {} DESC
            LIMIT %s
        """.format(sort_column)

        params = (settings.IGNORED_MEMBER_IDS, self.limit)
        return self.query_stats(sql, params)

    def activity_trend(self):
        """Number of messages per month, ordered by year and month."""
        sql = """
            SELECT extract(month from posted_at)::int AS m,
                   extract(year from posted_at)::int AS y,
                   count(*)
            FROM messages
            WHERE member_id NOT IN %s
            GROUP BY y, m
            ORDER BY y,m
        """

        params = (settings.IGNORED_MEMBER_IDS, )
        return self.query_stats(sql, params)

    def most_used_emotes(self):
        """
        Emote names ordered by their combined count in messages and in
        reactions. Only names present in both sub-queries are returned.
        """
        sql = """
            WITH _emotes AS (
                SELECT lower(name) AS e_name, sum(count) AS e_count
                FROM emotes
                WHERE member_id NOT IN %s
                GROUP BY e_name
                ),
                _reactions AS (
                    SELECT lower(e.name) as e_name, count(*) AS r_count
                    FROM reactions r
                    JOIN emotes e ON e.emote_id = r.emote_id
                    WHERE r.member_id NOT IN %s
                    GROUP BY e_name
                )
                SELECT e.e_name, e.e_count, r.r_count, (e.e_count + r.r_count) AS total
                FROM _emotes e
                JOIN _reactions r ON r.e_name = e.e_name
                ORDER BY total DESC
                LIMIT %s;
            """
        ignored = settings.IGNORED_MEMBER_IDS
        # The ignore list is bound twice, once per sub-query.
        params = (ignored, ignored, self.limit)

        return self.query_stats(sql, params)

    def most_active_member(self):
        """Members ordered by the number of messages they posted."""
        sql = """
        SELECT member_id, mbr.name, count(*)
        FROM messages m
        JOIN members mbr ON mbr.id = m.member_id
        WHERE m.member_id NOT IN %s
        GROUP BY m.member_id, mbr.name
        ORDER BY count DESC
        LIMIT %s
        """

        params = (settings.IGNORED_MEMBER_IDS, self.limit)
        return self.query_stats(sql, params)

    def most_mentioned_member(self):
        """Members ordered by how often they appear in message mentions."""
        sql = """
        WITH u_mentions AS (
            SELECT unnest(mentions) as _member_id, count(*)
            FROM messages
            GROUP BY _member_id
        )
        SELECT mbr.name, u.* FROM u_mentions u
        JOIN members mbr ON mbr.id = u._member_id
        WHERE u._member_id NOT IN %s
        ORDER BY u.count DESC
        LIMIT %s
        """

        params = (settings.IGNORED_MEMBER_IDS, self.limit)
        return self.query_stats(sql, params)