예제 #1
0
    def setup_new_word_cloud(self, preferences):
        """Creates a word cloud from the passed preferences and writes the word frequency files it needs

        The defaults for the requested type are fetched from WordCloud, and every default whose key also
        appears in preferences is replaced by the passed value. Keys of preferences that have no default
        are not carried over.
        Parameters:
            preferences: a dict of settings; it must contain a 'type' key whose value is one of
                         WordCloud.WORD_CLOUD_TYPES
        Return:
            The result of verify_word_cloud_setup() for the newly created word cloud
        """
        assert 'type' in preferences, "You must pass a type argument"
        cloud_type = preferences['type']
        assert cloud_type in WordCloud.WORD_CLOUD_TYPES, \
            "invalid type, {0} is not in {1}".format(cloud_type, WordCloud.WORD_CLOUD_TYPES)

        # start from the defaults for this type, then overlay whatever the caller supplied
        settings = WordCloud.get_default_preferences(cloud_type)
        for option in list(settings):
            if option in preferences:
                settings[option] = preferences[option]

        # build the word cloud, then write the word frequencies to its input directory
        self._word_cloud = WordCloud(cloud_type, settings)
        self.save_word_freq(path=WordCloud.WORD_CLOUD_INPUT_PATH)

        return self._word_cloud.verify_word_cloud_setup()
def word_cloud(convo_num):
    """Request handler for the word cloud page of one conversation.

    On GET the word cloud form is rendered with the available input text files, excluded word files and
    images. On any other method the submitted form is used to set up a word cloud; if the conversation
    reports it is ready the cloud is created and the client is redirected to the result page, otherwise a
    plain page listing the submitted preferences and the reported issues is returned.

    Parameters:
        convo_num: identifier passed to load_all_gui to load the conversation, and used in the result URL
    Return:
        The rendered template, a redirect, or an HTML string describing the setup issues
    """
    from html import escape  # local import: only the issues page at the bottom needs it

    current_convo = load_all_gui(convo_num)
    WordCloud.setup_word_cloud_starter_files()
    # save conversation files to input dir
    current_convo.save_word_freq(path=WordCloud.WORD_CLOUD_INPUT_PATH)

    if request.method == 'GET':
        image_files = {
            file: WordCloud.get_image_size(WordCloud.WORD_CLOUD_IMAGE_PATH + file)
            for file in WordCloud.get_image_files()
        }
        return render_template(
            'word_clouds.html',
            excluded_word_files=WordCloud.get_excluded_word_files(),
            image_files=image_files,
            input_word_files=WordCloud.get_input_text_files(),
            excluded_word_path=WordCloud.WORD_CLOUD_EXCLUDED_WORDS_PATH,
            input_word_path=WordCloud.WORD_CLOUD_INPUT_PATH,
            image_path=WordCloud.WORD_CLOUD_IMAGE_PATH,
            MAX_COLORS=GUIConvoReader.MAX_NUM_COLORS,
            MAX_LAYERS=GUIConvoReader.MAX_NUM_LAYERS)

    wc_preferences = dict(request.form.items())
    ready = current_convo.setup_new_word_cloud(wc_preferences)
    if current_convo.ready_for_word_cloud():
        current_convo.create_word_cloud()
        return redirect(
            '/word_clouds/conversation/{convo_num}/result'.format(
                convo_num=convo_num))

    # Both values can contain text typed into the form, so they are escaped before being placed in markup
    return 'Preferences: {}<br><br>Issues: {}'.format(
        escape(str(wc_preferences)), escape(str(ready)))
예제 #3
0
    def _setup_new_word_cloud(self, preferences):
        """Creates the word cloud described by preferences and writes the word frequency files it needs

        Parameters:
            preferences: a dict of settings; it must contain a 'type' key whose value is one of
                         WordCloud.WORD_CLOUD_TYPES
        Return:
            The result of verify_word_cloud_setup() for the newly created word cloud
        """
        assert 'type' in preferences, "You must pass a type argument"
        cloud_type = preferences['type']
        assert cloud_type in WordCloud.WORD_CLOUD_TYPES, \
            "invalid type, {0} is not in {1}".format(cloud_type, WordCloud.WORD_CLOUD_TYPES)

        # build the word cloud first, then write the word frequencies to its input directory
        self._word_cloud = WordCloud(cloud_type, preferences)
        self.save_word_freq(path=WordCloud.WORD_CLOUD_INPUT_PATH)

        return self._word_cloud.verify_word_cloud_setup()
예제 #4
0
    def setup_new_word_cloud(self, preferences):
        """Cleans up data from html form to work with kumo

        The passed dict is modified in place and then handed to BaseConvoReader.setup_new_word_cloud.
        Parameters:
            preferences: a dict of form fields. 'width' and 'height' are required. 'num_colors' may be
                         missing or None, in which case WordCloud.DEFAULT_COLORS is used. The 'layered' and
                         'polarity' types read the extra per-layer / per-polarity keys shown in the
                         branches below
        Return:
            Whatever BaseConvoReader.setup_new_word_cloud returns for the cleaned preferences
        """

        def rgb_colors(key_format, count):
            """RGB lists for the hex colors stored under key_format.format(1) .. key_format.format(count)"""
            return [list(WordCloud.hex_to_rgb(preferences[key_format.format(i)])) for i in range(1, count + 1)]

        # string values of the integer fields are converted to int
        for key in self.integer_fields():
            if key in preferences and isinstance(preferences[key], str):
                try:
                    preferences[key] = int(preferences[key])
                except ValueError:
                    # best effort: a value that is not an integer is passed on unchanged
                    pass

        preferences['dimensions'] = [preferences['width'], preferences['height']]

        # a single excluded-words entry becomes a one element list; the literal 'None' means no exclusions
        if isinstance(preferences.get('excluded_words'), str):
            if preferences['excluded_words'] == 'None':
                preferences['excluded_words'] = []
            else:
                preferences['excluded_words'] = [preferences['excluded_words']]

        # .get() so that a form without a color count falls back to the defaults instead of raising KeyError
        num_colors = preferences.get('num_colors')
        if num_colors is not None:
            preferences['colors'] = rgb_colors('color{}', num_colors)
        else:
            preferences['colors'] = WordCloud.DEFAULT_COLORS

        if preferences.get('output_name') == 'current_time.png':
            preferences['output_name'] = WordCloud.DEFAULT_OUTPUT_NAME

        if preferences.get('shape') != 'image':
            preferences['image_name'] = 'None'

        if preferences.get('type') == 'layered':
            image_sets, text_sets, color_sets = [], [], []
            for layer in range(1, preferences['num_layers'] + 1):
                layer_colors = int(preferences['num_colors{}'.format(layer)])
                # '{{}}' leaves a placeholder for the color number, e.g. 'layer2_color{}'
                color_sets.append(rgb_colors('layer{}_color{{}}'.format(layer), layer_colors))
                image_sets.append(preferences['image_name{}'.format(layer)])
                text_sets.append(preferences['input_words{}'.format(layer)])

            preferences['image_sets'] = image_sets
            preferences['text_sets'] = text_sets
            preferences['color_sets'] = color_sets

        elif preferences.get('type') == 'polarity':
            preferences['color_set_1'] = rgb_colors('polarity1_color{}', preferences['num_colors1_polarity'])
            preferences['color_set_2'] = rgb_colors('polarity2_color{}', preferences['num_colors2_polarity'])
            preferences['text_set_1'] = preferences['input_words1_polarity']
            preferences['text_set_2'] = preferences['input_words2_polarity']

        # remove unnecessary fields from preferences
        for key in self.removable_fields():
            if key in preferences:
                del preferences[key]

        return BaseConvoReader.setup_new_word_cloud(self, preferences)
예제 #5
0
class BaseConvoReader:
    """Provides base analysis of conversations, extended by ConvoReader and GUIConvoReader classes"""

    BASE_PATH = 'data/conversation_data/'

    def __init__(self, convo_name, convo_list, rank, emojify=True):
        """Parameters:
            convo_name: A string for the conversation name, found in your facebook archive
            convo_list: A 2D list with inner lists of the format [person_name (str), message (str), date-time (str)]
            rank: Converted with str() and used as the sub-directory of BASE_PATH for this conversation's files
            emojify: A boolean, whether to convert python src encodings for emojis in message to unicode
        """
        self._name = convo_name.lower()
        convert = emojis.emojify if emojify else (lambda text: text)
        self._convo = [[sender.lower(), convert(text), CustomDate(stamp)] for sender, text, stamp in convo_list]
        # get_people() and _cleaned_word_freqs() read _name / _convo, so they run only after those are set
        self._people = self.get_people()
        listed_names = self._name.split(', ')
        self._kicked_or_left = [person for person in self._people if person not in listed_names]
        self._individual_words = self._cleaned_word_freqs()
        self._len = len(self._convo)
        self._path = '{0}{1}/'.format(BaseConvoReader.BASE_PATH, rank)
        self._word_cloud = None

    # -----------------------------------------------   PUBLIC METHODS ---------------------------------------------- #

    def get_people(self) -> list:
        """Returns a list of lower case names for the individuals in this conversation. Does not include DUPLICATE #X
        if the conversation name includes that.
        """
        if hasattr(self, '_people'):
            return self._people

        duplicate = re.compile("duplicate #\d+", re.IGNORECASE)
        people = []
        for person in sorted(self._name.split(', ')):
            if duplicate.fullmatch(person) is None:
                people.append(person)
        for person, msg, date in self._convo:
            if person.lower() not in people:
                people.append(person.lower())
        return sorted(people)

    def save_word_freq(self, path=None):
        """Saves to a file the ordered rankings of word frequencies by person and aggregate for the chat
        Parameters:
            path (optional): a string path representing the relative path to save files at
        """
        path = self._path if path is None else path
        os.makedirs(path[0:-1], exist_ok=True)
        for person, counter in self._individual_words.items():
            split = person.split()
            pers = ''
            for i in range(len(split) - 1):
                pers += split[i]
                pers += '-'
            pers += split[-1]
            with open(path + pers + '_word_freq.txt', mode='w', encoding='utf-8') as f:
                lst = []
                for key, val in counter.items():
                    lst.append((key, val))
                for key, val in sorted(lst, key=lambda x: x[1], reverse=True):
                    f.write("{0}: {1}".format(key, val) + "\n")
        count = Counter()
        for key, val in self._individual_words.items():
            count += val
        with open(path + 'total.txt', mode='w', encoding='utf-8') as f:
            for key, val in count.most_common():
                f.write("{0}: {1}".format(key, val) + "\n")

    def raw_characters(self, person=None) -> Counter:
        """Returns character frequencies in a Counter object
        Parameters:
            person (optional): The name of the person as a string whose character data you would like. Defaults to None,
                               or data for the aggregate conversation
        Return:
            Counter with keys being characters (as strings) used in the conversation and values being their frequency
        """
        if person is not None:
            assert type(person) is str, "Optional parameter person must be a string"
            person = person.lower()
            assert person in self._people, "{0} isn't in this conversation; this conversation is for" \
                                           " {1}".format(person, str(self._people))
        res = Counter()
        for pers, msg, date in self._convo:
            if person is None or pers == person:
                res.update(msg)
        return res

    def raw_emojis(self, person=None) -> Counter:
        """Returns emojis frequency for the conversation in a Counter object
        Parameter:
            person (optional): the name of the person whose emojis frequencies you would like. If left to default
                None, an aggregate total for the conversation is returned
        Return:
            Counter with keys being emojis (or private use unicode values*) and values being their frequency
        *http://stackoverflow.com/questions/38780324/python3-src-encodings-of-emojis
        """
        chars = self.raw_characters(person=person)
        res = Counter()
        for key, val in chars.most_common():
            if '\\U000' in repr(key) and key is not None:
                try:
                    temp_emoji = emojis.src_to_emoiji(key)
                    if temp_emoji in res:
                        res[temp_emoji] += val
                    else:
                        res[temp_emoji] = val
                except KeyError:
                    res[key] = val
            else:
                for unicode_emoji in emojis.UNICODE_EMOJI:
                    if unicode_emoji == key:
                        if key in res:
                            res[key] += val
                        else:
                            res[key] = val
        return res

    def raw_messages(self, name=None):
        """Returns information about the number of messages in the chat. Return type depends on parameters passed
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defaults to None
        Return:
            If name is left to default a Counter object with names of people as keys and the number of messages as
            values. Otherwise an integer representing the number of messages for the person passed
        """
        if name is None:
            return self.__msgs_per_person()
        else:
            return self.__msgs_spoken(name)

    def raw_words(self, name=None):
        """Returns information about the number of words in the chat. Return type depends on parameters passed
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defaults to None
        Return:
             If name is left to default a Counter object with names of people as keys and the number of words as values.
             Otherwise an integer representing the number of messages for the person passed
        """

        if name is None:
            return self.__words_per_person()
        else:
            return self.__words_spoken(name)

    def raw_ave_words(self, name=None):
        """Average number of words for people in the chat
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defautls to None
        Return:
            If name is left to default a Counter object with names of people as keys and the average words/message as
            values. Otherwise a float representing the average number of words/ message for the person passed.
        """

        if name is None:
            return self.__ave_words_per_person()
        else:
            return self.__ave_words(name)

    def raw_msgs_graph(self, contact=None, forward_shift=0) -> list:
        """The raw data used by print_msgs_graph to display message graphs
        Parameters:
            contact (optional): the name (as a string) of the person you are interested in, or a list of names
                (default: all contacts)
            forward_shift (optional): The number of minutes past 12 midnight that should count as the previous day
        Return:
            A 2D list with inner lists being of the form [ CustomDate(), num-messages], one per day from the first
            message to the last message. num-messages is the integer number of messages counted for that day.
        """
        contact = self._assert_contact(contact)
        assert isinstance(forward_shift, int), "Forward shift must be an integer"
        minutes_per_day = 60 * 24
        assert -minutes_per_day < forward_shift < minutes_per_day, \
            "Forward shift must be between {0} and {1}, not including them".format(-minutes_per_day, minutes_per_day)

        first_day = self._convo[0][2]
        last_day = self._convo[-1][2]
        span = last_day - first_day

        counts = [0] * (span + 1)
        for person, _, date in self._convo:
            if contact is not None and person.lower() not in contact:
                continue
            if date.minutes() < forward_shift:
                # early enough to be counted as the previous day (never before the first day)
                counts[max(0, date - first_day - 1)] += 1
            else:
                counts[date - first_day] += 1

        return [[CustomDate.from_date(first_day + offset), total] for offset, total in enumerate(counts)]

    def raw_msgs_by_weekday(self, contact=None, percent=True) -> list:
        """Returns the frequency of chatting by days of week
            Parameters:
                contact (optional): (str|None) a string representation of the contact whose data you would like, or if
                                 left to default None data for the entire conversation
                percent (optional): (boolean) Whether to return a percent frequency or raw frequency list
            Return:
                A list containing frequency of chatting by days of week, ordered by index,
                with [0] being Monday, [1] Tuesday.. and [6] Sunday
        """
        contact = self._assert_contact(contact)

        if contact is None:
            key = lambda x: True
        else:
            key = lambda x: x in contact

        weekday_freq = [0 for _ in range(7)]
        for p, m, d in self._convo:
            if key(p):
                weekday_freq[d.weekday()] += 1

        if percent:  # return a percentage of messages by day
            weekday_total = sum(weekday_freq)
            if weekday_total == 0:  # If this conversation has no messages
                return [0 for _ in weekday_freq]
            return [day / weekday_total for day in weekday_freq]
        else:
            return weekday_freq

    def raw_msgs_by_time(self, window=60, contact=None) -> list:
        """The percent of conversation by time of day
        Parameters:
            window (optional): The time length of each bin in minutes (default, 60 minutes, or 1 hour)
            contact (optional): The contact (or list of contacts) you are interested in. (default, all contacts)
        Return:
            a list of [time, percent] pairs, starting at 12:00 am, where percent is the share of ALL messages in
            the conversation that the selected contact(s) sent in that window. Default window is 60 minute
            interval. If time less than the passed window is left at the end, it is put at the end of
            the list in it's own window. e.g. if window=60, the  list returned is of length 24, with each index
                                                    representing one hour (60 minutes) of chatting
                                              if window=61 the list returned is still of length 24, but indexes
                                                    0-22 representing 61 minutes, and index 23 representing 37 minutes
            For a conversation without messages every percent is 0.
        """
        contact = self._assert_contact(contact)

        msg_bucket = [[CustomDate.minutes_to_time(i * window), 0] for i in range(ceil(60 * 24 / window))]

        for person, _, date in self._convo:
            if contact is None or person.lower() in contact:
                index = (date.minutes() // window) % len(msg_bucket)
                msg_bucket[index][1] += 1

        total = len(self)
        if total == 0:
            # nothing to normalise by; the counts are already all 0
            return msg_bucket
        for bucket in msg_bucket:
            bucket[1] /= (total / 100)
        return msg_bucket

    def raw_frequency(self, person=None, word=None):
        """Frequency of word use for people in the chat
        Parameters:
            person (optional): The name (as a string) of the person you are interested in
            word (optional): The word (as a string) you are interested in
        Return:
            There are 4 different return types depending on the arguments passed:
            Yes person and Yes word: the number of times the specified person has said the specified word
            Yes person and No word: A counter object of words mapped to their frequency for the specified person
            No person and Yes word: The number of times the specified word has been said by anyone in the chat
            No person and No word: A dictionary with keys being the names of people in the conversation
                and values being counter objects of words mapped to their frequency
        """
        if person is not None:
            person = person.lower()
            assert person in self._name, "\"{0}\" is not in this conversation".format(person.title())
        if word is not None:
            word = word.lower()
        if person is not None:
            try:
                if word is not None:
                    return self._individual_words[person][word]
                else:
                    return self._individual_words[person]
            except KeyError:
                return Counter() if word is None else 0
        else:
            if word is not None:
                res = 0
                for key, val in self._individual_words.items():
                    res += self._individual_words[key][word]
                return res
            else:
                return self._individual_words

    def raw_convo_starter_freqs(self, threshold) -> Counter:
        """Returns the frequency that each participant begins conversations as percents
        Parameter:
            threshold: the number of minutes lag that counts as the threshold for starting a new conversation.
        Return:
            A Counter of names mapped to frequencies of conversation starting (as percents). If no conversation
            start was recorded at all, the Counter from _raw_convo_starter is returned unchanged.
        """
        starts = self._raw_convo_starter(threshold)
        total = sum(len(indexes) for indexes in starts.values())
        if total == 0:
            return starts
        return Counter({name: len(indexes) / total * 100 for name, indexes in starts.items()})

    def raw_convo_killer_freqs(self, threshold) -> Counter:
        """Returns the frequency (as percents) that each participant 'kills' conversations, where killing is defined as
        being the last person to send a message with no replies for at least threshold minutes
        Parameters:
            threshold: the number of minutes lag that counts as the threshold for starting a new conversation.
        Return:
            A Counter of names mapped to frequencies of conversation killing (as percents). If no conversation
            end was recorded at all, the Counter from _raw_convo_killer is returned unchanged.
        """
        kills = self._raw_convo_killer(threshold)
        total = sum(len(indexes) for indexes in kills.values())
        if total == 0:
            return kills
        return Counter({name: len(indexes) / total * 100 for name, indexes in kills.items()})

    def raw_find_indexes(self, query, ignore_case=False) -> list:
        """Returns a list with the indexes of each message that contain the passed message
        Parameters:
            query: The string query to search for
            ignore_case (optional): Whether to search by case sensitive
        Return:
            A list sorted list of indexes for messages containing query
        """
        # python re cheat sheet: https://www.debuggex.com/cheatsheet/regex/python

        assert isinstance(query, str), "query must be a string"
        key = lambda x: x
        if ignore_case:
            key = lambda x: x.lower()
        indexes = []
        for i in range(len(self._convo)):
            if query in key(self._convo[i][1]):
                indexes.append(i)
        return indexes

    def raw_match_indexes(self, query, ignore_case=False) -> list:
        """Returns a list with the indexes of each message that match the passed message
        Parameters:
            query: The string query to search for
            ignore_case (optional): Whether to search by case sensitive
        Return:
            A list sorted list of indexes for messages exactly matching query
        """
        # python re cheat sheet: https://www.debuggex.com/cheatsheet/regex/python

        indexes = []
        try:
            r = re.compile(query, re.IGNORECASE) if ignore_case else re.compile(query)
            for i in range(len(self._convo)):
                if r.fullmatch(self._convo[i][1]) is not None:
                    indexes.append(i)
            return indexes
        except re.error:
            raise re.error("\"{0}\" is not a valid regex string".format(query))

    def raw_longest_messages(self, num=None) -> list:
        """Returns a list of integers corresponding to message indexes, sorted in reverse order based on length (longest
        message index first)
        Parameters:
            num (optional): (int|None) the number of messages to include, or None to include all
        """
        assert num is None or isinstance(num, int), (
            "num must be None or an integer representing the number of messages desired"
        )
        num = min(num, len(self)) if num is not None else len(self)

        order = Counter({index: len(self._convo[index][1]) for index in range(len(self))})
        return order.most_common(num)

    # -----------------------------------------------   PUBLIC METHODS ---------------------------------------------- #

    # -----------------------------------------------   PRIVATE METHODS ---------------------------------------------- #

    @staticmethod
    def list_to_combined_string(list_of_people):
        """Combines a list of people into a single string, converting each perons's name to lowercase, separating
        first/middle/last name(s) with hyphens (-) and various individual's names with underscores (_). Cuts off a
        string past 255 raw_characters
        """
        name = ""
        for person in list_of_people:
            split = person.split(' ')
            for i in range(len(split) - 1):
                name += split[i]
                name += '-'
            name += split[-1]
            name += '_'
        name = name[:-1]
        if len(name) > 255:
            name = name[:255]
        else:
            name = name

        return name + '/'

    def _setup_new_word_cloud(self, preferences):
        """Takes in all the preferences for a word cloud, and writes everything to files"""
        assert 'type' in preferences, "You must pass a type argument"
        assert preferences['type'] in WordCloud.WORD_CLOUD_TYPES, "invalid type, {0} is not in {1}"\
            .format(preferences['type'], WordCloud.WORD_CLOUD_TYPES)

        # Creates wordcloud with directory
        wc_type = preferences['type']
        self._word_cloud = WordCloud(wc_type, preferences)
        self.save_word_freq(path=WordCloud.WORD_CLOUD_INPUT_PATH)

        # Returns the results from verifying settings for this wordcloud
        return self._word_cloud.verify_word_cloud_setup()

    def _raw_convo_starter(self, threshold, start=None, end=None):
        """Finds the messages that start a conversation, i.e. that follow a gap of at least threshold
        Parameters:
            threshold: the gap to the previous message (compared against CustomDate.distance_from) from which
                       on a message counts as starting a new conversation
            start (optional): date to begin scanning at; None scans from the beginning
            end (optional): date to stop scanning at; None scans to the last message
        Return:
            A Counter mapping each person's name (title cased) to the list of message indexes they started a
            conversation with
        """
        CustomDate.assert_dates(start, end)

        # Sets the start and end dates, finds the appropriate
        #  message number if start/ end are not None, else index 1 for start and len(convo) for end
        start_date_index = CustomDate.bsearch_index(self._convo, start, key=lambda x: x[2]) if start is not None else 1
        # the end index depends on `end`; testing `start` here ignored a passed end and searched for None
        end_date_index = CustomDate.bsearch_index(self._convo, end, key=lambda x: x[2]) \
            if end is not None else self._len

        convo_start_freq = {person: [] for person in self._people}
        if self._convo:
            # the message right before the scanned range has no predecessor to compare with and always counts
            convo_start_freq[self._convo[start_date_index - 1][0]].append(start_date_index - 1)
        for i in range(start_date_index, end_date_index):
            curr_date = self._convo[i][2]
            prev_date = self._convo[i - 1][2]
            if curr_date.distance_from(prev_date) >= threshold:
                convo_start_freq[self._convo[i][0]].append(i)
        return Counter(dict((key.title(), val) for key, val in convo_start_freq.items()))

    def _raw_convo_killer(self, threshold, start=None, end=None):
        """Finds the messages that end a conversation, i.e. that are followed by a gap of at least threshold
        Parameters:
            threshold: the gap to the next message (compared against CustomDate.distance_from) from which on a
                       message counts as having ended a conversation
            start (optional): date to begin scanning at; None scans from the beginning
            end (optional): date to stop scanning at; None scans up to the second to last message
        Return:
            A Counter mapping each person's name (title cased) to the list of message indexes they ended a
            conversation with
        """
        CustomDate.assert_dates(start, end)

        # Sets the start and end dates, finds the appropriate
        #  message number if start/ end are not None, else index 0 for start and len(convo) - 1 for end
        start_date_index = CustomDate.bsearch_index(self._convo, start, key=lambda x: x[2]) if start is not None else 0
        # the end index depends on `end`; testing `start` here ignored a passed end and searched for None
        end_date_index = CustomDate.bsearch_index(self._convo, end, key=lambda x: x[2]) \
            if end is not None else self._len - 1

        convo_start_freq = {person: [] for person in self._people}
        if self._convo:
            # With the default start this is index -1: the sender of the final message is recorded with -1.
            # NOTE(review): with an explicit start this records the message BEFORE the range - confirm intent
            convo_start_freq[self._convo[start_date_index - 1][0]].append(start_date_index - 1)
        for i in range(start_date_index, end_date_index):
            curr_date = self._convo[i][2]
            next_date = self._convo[i + 1][2]
            if next_date.distance_from(curr_date) >= threshold:
                convo_start_freq[self._convo[i][0]].append(i)
        return Counter(dict((key.title(), val) for key, val in convo_start_freq.items()))

    def _raw_word_freqs(self):
        """Returns a dictionary that maps names of people in the conversation
        to a Counter object of their raw word frequencies
        """
        raw_word_freq = dict()
        for person, msg, date in self._convo:
            if person not in raw_word_freq:
                raw_word_freq[person] = Counter()
            raw_word_freq[person].update(msg.lower().split(' '))
        return raw_word_freq

    def _cleaned_word_freqs(self):
        raw_words = self._raw_word_freqs()
        cleaned_words = dict()
        for key, val in raw_words.items():
            cleaned_words[key] = Counter()
            for word, freq in val.most_common():
                striped_word = word.strip('.!123456789-+?><}{][()\'\""\\ /*#$%^&#@,')
                if striped_word < 'z' * 10:
                    if '.com' not in striped_word and 'www.' not in striped_word \
                            and 'http' not in striped_word and '.io' not in striped_word \
                            and '.edu' not in striped_word:
                        if striped_word not in cleaned_words[key]:
                            cleaned_words[key][striped_word] = freq
                        else:
                            cleaned_words[key][striped_word] += freq
        return cleaned_words

    def __msgs_per_person(self):
        res = dict()
        for person, msg, date in self._convo:
            if person not in res:
                res[person] = 1
            else:
                res[person] += 1
        return Counter(res)

    def __msgs_spoken(self, name):
        name = name.lower()
        if name not in self._people:
            raise Exception("Invalid name passed")
        num = 0
        for person, msg, date in self._convo:
            if person == name:
                num += 1
        return num

    def __words_per_person(self):
        res = dict()
        for person, msg, date in self._convo:
            if person not in res:
                res[person] = len(msg.split())
            else:
                res[person] += len(msg.split())
        return Counter(res)

    def __words_spoken(self, name):
        name = name.lower()
        if name not in self._people:
            raise Exception("Invalid name passed")
        num = 0
        for person, msg, date in self._convo:
            if person == name:
                num += len(msg.split())
        return num

    def __ave_words_per_person(self):
        words = []
        for name in self._people:
            msgs = float(self.__msgs_spoken(name))
            tot_words = float(self.__words_spoken(name))
            if msgs > 0:
                words.append((name, tot_words / msgs))
        res = Counter()
        for name, ave in words:
            res[name] = ave
        return res

    def __ave_words(self, name):
        name = name.lower()
        if name not in self._people:
            return -1
        if self.__msgs_spoken(name) == 0:
            return 0
        return self.__words_spoken(name) / self.__msgs_spoken(name)

    def _assert_contact(self, contact):
        assert type(contact) in [type(None), str, list], "Contact must be of type string or a list of strings"
        if type(contact) is list:
            for i, ele in enumerate(contact):
                assert type(ele) is str, "Each element in contact must be a string"
                contact[i] = ele.lower()
            for ele in contact:
                assert ele in self._people, "{0} is not in the list of people for this conversation:\n{1}".format(
                    ele, str(self._people))
        elif type(contact) is str:
            assert contact in self._people, "{0} is not in the list of people for this conversation:\n{1}".format(
                contact, str(self._people))
            contact = [contact]

        return contact

    # -----------------------------------------------   PRIVATE METHODS ---------------------------------------------- #

    def __getitem__(self, index):
        """Returns the tuple (person, message, datetime) for the corresponding index"""
        if type(index) is not int:
            raise TypeError
        elif index >= len(self) or index < -len(self):
            raise IndexError
        else:
            return self._convo[index] if index >= 0 else self._convo[len(self) + index]

    def __len__(self) -> int:
        """Returns the number of messages in self (the value stored in self._len)"""
        return self._len

    def __str__(self):
        """Returns a string with the alphabetically sorted names of people
        in this conversation
        """
        return "Conversation for " + self._name.title()

    def __iter__(self):
        """Returns a generator that yields the entries of self._convo in their stored order"""
        return (message for message in self._convo)
예제 #6
0
class BaseConvoReader:
    """Provides base analysis of conversations, extended by ConvoReader and GUIConvoReader classes"""

    BASE_PATH = 'data/conversation_data/'

    def __init__(self, convo_name, convo_list, rank):
        """Parameters:
            convo_name: A string for the conversation name, found in your facebook archive
            convo_list: A 2D list with inner lists of the format [person_name (str), message (str), date-time (str)]
        """
        self._name = convo_name.lower()
        self._convo = [[name.lower(), msg, CustomDate(date)] for name, msg, date in convo_list]
        self._people = self.get_people()
        self._kicked_or_left = [person for person in self._people if person not in self._name.split(', ')]
        self._individual_words = self._cleaned_word_freqs()
        self._len = len(self._convo)
        self._path = BaseConvoReader.BASE_PATH + str(rank) + '/'
        self._word_cloud = None

    # -----------------------------------------------   PUBLIC METHODS ---------------------------------------------- #

    def get_people(self) -> list:
        """Returns a list of lower case names for the individuals in this conversation. Does not include DUPLICATE #X
        if the conversation name includes that.
        """
        if hasattr(self, '_people'):
            return self._people

        duplicate = re.compile("duplicate #\d+", re.IGNORECASE)
        people = []
        for person in sorted(self._name.split(', ')):
            if duplicate.fullmatch(person) is None:
                people.append(person)
        for person, msg, date in self._convo:
            if person.lower() not in people:
                people.append(person.lower())
        return sorted(people)

    def save_word_freq(self, path=None):
        """Saves to a file the ordered rankings of word frequencies by person and aggregate for the chat
        Parameters:
            path (optional): a string path representing the relative path to save files at
        """
        path = self._path if path is None else path
        os.makedirs(path[0:-1], exist_ok=True)
        for person, counter in self._individual_words.items():
            split = person.split()
            pers = ''
            for i in range(len(split) - 1):
                pers += split[i]
                pers += '-'
            pers += split[-1]
            with open(path + pers + '_word_freq.txt', mode='w', encoding='utf-8') as f:
                lst = []
                for key, val in counter.items():
                    lst.append((key, val))
                for key, val in sorted(lst, key=lambda x: x[1], reverse=True):
                    f.write("{0}: {1}".format(key, val) + "\n")
        count = Counter()
        for key, val in self._individual_words.items():
            count += val
        with open(path + 'total.txt', mode='w', encoding='utf-8') as f:
            for key, val in count.most_common():
                f.write("{0}: {1}".format(key, val) + "\n")

    def raw_characters(self, person=None) -> Counter:
        """Returns character frequencies in a Counter object
        Parameters:
            person (optional): The name of the person as a string whose character data you would like. Defaults to None,
                               or data for the aggregate conversation
        Return:
            Counter with keys being characters (as strings) used in the conversation and values being their frequency
        """
        if person is not None:
            assert type(person) is str, "Optional parameter person must be a string"
            person = person.lower()
            assert person in self._people, "{0} isn't in this conversation; this conversation is for" \
                                           " {1}".format(person, str(self._people))
        res = Counter()
        for pers, msg, date in self._convo:
            if person is None or pers == person:
                res.update(msg)
        return res

    def raw_emojis(self, person=None) -> Counter:
        """Counts how often each emoji appears in the conversation
        Parameter:
            person (optional): the name of the person whose emoji frequencies you would like. If left to default
                None, an aggregate total for the conversation is returned
        Return:
            Counter with keys being emojis (or private use unicode values*) and values being their frequency
        *http://stackoverflow.com/questions/38780324/python3-src-encodings-of-emojis
        """
        tally = Counter()
        for char, count in self.raw_characters(person=person).most_common():
            if '\\U000' in repr(char) and char is not None:
                # Characters whose repr is a \U000... escape are translated through emojis.src_to_emoiji; when
                # that lookup raises KeyError the untranslated character is recorded instead
                try:
                    tally[emojis.src_to_emoiji(char)] += count
                except KeyError:
                    tally[char] = count
                continue
            # Everything else only counts if it equals an entry of emojis.UNICODE_EMOJI
            for known in emojis.UNICODE_EMOJI:
                if known == char:
                    tally[char] += count
        return tally

    def raw_messages(self, name=None):
        """Returns information about the number of messages in the chat. Return type depends on parameters passed
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defaults to None
        Return:
            If name is left to default a Counter object with names of people as keys and the number of messages as
            values. Otherwise an integer representing the number of messages for the person passed
        """
        if name is None:
            return self.__msgs_per_person()
        else:
            return self.__msgs_spoken(name)

    def raw_words(self, name=None):
        """Returns information about the number of words in the chat. Return type depends on parameters passed
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defaults to None
        Return:
             If name is left to default a Counter object with names of people as keys and the number of words as values.
             Otherwise an integer representing the number of messages for the person passed
        """

        if name is None:
            return self.__words_per_person()
        else:
            return self.__words_spoken(name)

    def raw_ave_words(self, name=None):
        """Average number of words for people in the chat
        Parameters:
            name (optional): The name (as a string) of the person you are interested in. Defautls to None
        Return:
            If name is left to default a Counter object with names of people as keys and the average words/message as
            values. Otherwise a float representing the average number of words/ message for the person passed.
        """

        if name is None:
            return self.__ave_words_per_person()
        else:
            return self.__ave_words(name)

    def raw_msgs_graph(self, contact=None, forward_shift=0) -> list:
        """The raw per-day message counts used by print_msgs_graph to display message graphs
        Parameters:
            contact (optional): the name (as a string) of the person you are interested in
                (default: all contacts)
            forward_shift (optional): The number of minutes past 12 midnight that should count as the previous day
        Return:
            A 2D list with inner lists being of the form [ CustomDate(), num-messages], one per day from the first
            message to the last. num-messages is the integer number of matching messages counted for that day.
        """
        contact = self._assert_contact(contact)
        assert isinstance(forward_shift, int), "Forward shift must be an integer"
        assert -60 * 24 < forward_shift < 60 * 24, "Forward shift must be between {0} and {1}, not including them" \
            .format(-60 * 24, 60 * 24)

        first_day = self._convo[0][2]
        span = self._convo[-1][2] - first_day

        counts = [[None, 0] for _ in range(span + 1)]
        for person, _, date in self._convo:
            if contact is not None and person.lower() not in contact:
                continue
            before_shift = date.minutes() < forward_shift
            offset = date - first_day
            if before_shift:
                # Early enough to belong to the previous day, but never earlier than the first day
                offset = max(0, offset - 1)
            counts[offset][1] += 1

        # Label every slot with its date only after all of the counting is done
        for day, slot in enumerate(counts):
            slot[0] = CustomDate.from_date(first_day + day)

        return counts

    def raw_msgs_by_weekday(self, contact=None, percent=True) -> list:
        """Returns the frequency of chatting by days of week
            Parameters:
                contact (optional): (str|None) a string representation of the contact whose data you would like, or if
                                 left to default None data for the entire conversation
                percent (optional): (boolean) Whether to return a percent frequency or raw frequency list
            Return:
                A list containing frequency of chatting by days of week, ordered by index,
                with [0] being Monday, [1] Tuesday.. and [6] Sunday
        """
        contact = self._assert_contact(contact)

        if contact is None:
            key = lambda x: True
        else:
            key = lambda x: x in contact

        weekday_freq = [0 for _ in range(7)]
        for p, m, d in self._convo:
            if key(p):
                weekday_freq[d.weekday()] += 1

        if percent:  # return a percentage of messages by day
            weekday_total = sum(weekday_freq)
            if weekday_total == 0:  # If this conversation has no messages
                return [0 for _ in weekday_freq]
            return [day / weekday_total for day in weekday_freq]
        else:
            return weekday_freq

    def raw_msgs_by_time(self, window=60, contact=None) -> list:
        """The percent of conversation by time of day
        Parameters:
            window (optional): The time length of each bin in minutes (default, 60 minutes, or 1 hour)
            contact (optional): The contact you are interested in. (default, all contacts)
        Return:
            a list of [time, percent] pairs, one per window, starting at 12:00 am. Each percent is the number of
            matching messages in that window relative to ALL messages in the conversation. If time less than
            the passed window is left at the end, it is put at the end of the list in its own window.
            e.g. if window=60, the list returned is of length 24, with each index representing one hour
                 if window=61 the list returned is still of length 24, but indexes 0-22 represent 61 minutes
                 and index 23 represents 37 minutes
            For a conversation with no messages every percent is left at 0
        """
        contact = self._assert_contact(contact)

        bucket_count = ceil(60 * 24 / window)
        msg_bucket = [[CustomDate.minutes_to_time(i * window), 0] for i in range(bucket_count)]

        for person, _, date in self._convo:
            if contact is None or person.lower() in contact:
                msg_bucket[(date.minutes() // window) % bucket_count][1] += 1

        total = len(self)
        if total:  # an empty conversation would otherwise divide by zero
            for bucket in msg_bucket:
                bucket[1] /= (total / 100)
        return msg_bucket

    def raw_frequency(self, person=None, word=None):
        """Frequency of word use for people in the chat
        Parameters:
            person (optional): The name (as a string) of the person you are interested in
            word (optional): The word (as a string) you are interested in
        Return:
            There are 4 different return types depending on the arguments passed:
            Yes person and Yes word: the number of times the specified person has said the specified word
            Yes person and No word: A counter object of words mapped to their frequency for the specified person
            No person and Yes word: The number of times the specified word has been said by anyone in the chat
            No person and No word: A dictionary with keys being the names of people in the conversation
                and values being counter objects of words mapped to their frequency
        """
        if person is not None:
            person = person.lower()
            assert person in self._name, "\"{0}\" is not in this conversation".format(person.title())
        if word is not None:
            word = word.lower()
        if person is not None:
            try:
                if word is not None:
                    return self._individual_words[person][word]
                else:
                    return self._individual_words[person]
            except KeyError:
                return Counter() if word is None else 0
        else:
            if word is not None:
                res = 0
                for key, val in self._individual_words.items():
                    res += self._individual_words[key][word]
                return res
            else:
                return self._individual_words

    def raw_convo_starter_freqs(self, threshold) -> Counter:
        """Returns the frequency that each participant begins conversations as percents
        Parameter:
            threshold: the number of minutes lag that counts as the threshold for starting a new conversation.
        Return:
            A Counter of names mapped to frequencies of conversation starting (as percents)
        """
        raw_freqs = self._raw_convo_starter(threshold)
        total = sum(len(freq) for _, freq in raw_freqs.items())
        if total == 0:
            return raw_freqs
        res = Counter()
        for key, freq in raw_freqs.items():
            res[key] = len(freq) / total * 100
        return res

    def raw_convo_killer_freqs(self, threshold) -> Counter:
        """Returns the frequency (as percents) that each participant 'kills' conversations, where killing is defined as
        being the last person to send a message with no replies for at least threshold minutes
        Parameters:
            threshold: the number of minutes lag that counts as the threshold for starting a new conversation.
        Return:
            A Counter of names mapped to frequencies of conversation killing (as percents)
        """
        raw_freqs = self._raw_convo_killer(threshold)
        total = sum(len(freq) for _, freq in raw_freqs.items())
        if total == 0:
            return raw_freqs
        res = Counter()
        for key, freq in raw_freqs.items():
            res[key] = len(freq) / total * 100
        return res

    def raw_find_indexes(self, query, ignore_case=False) -> list:
        """Returns a list with the indexes of each message that contain the passed message
        Parameters:
            query: The string query to search for
            ignore_case (optional): Whether to search by case sensitive
        Return:
            A list sorted list of indexes for messages containing query
        """
        # python re cheat sheet: https://www.debuggex.com/cheatsheet/regex/python

        assert isinstance(query, str), "query must be a string"
        key = lambda x: x
        if ignore_case:
            key = lambda x: x.lower()
            query = query.lower()
        indexes = []
        for i in range(len(self._convo)):
            if query in key(self._convo[i][1]):
                indexes.append(i)
        return indexes

    def raw_match_indexes(self, query, ignore_case=False) -> list:
        """Returns a list with the indexes of each message that match the passed message
        Parameters:
            query: The string query to search for
            ignore_case (optional): Whether to search by case sensitive
        Return:
            A list sorted list of indexes for messages exactly matching query
        """
        # python re cheat sheet: https://www.debuggex.com/cheatsheet/regex/python

        indexes = []
        try:
            r = re.compile(query, re.IGNORECASE) if ignore_case else re.compile(query)
            for i in range(len(self._convo)):
                if r.fullmatch(self._convo[i][1]) is not None:
                    indexes.append(i)
            return indexes
        except re.error:
            raise re.error("\"{0}\" is not a valid regex string".format(query))

    def raw_fuzzy_match_indexes(self, query, ignore_case=False, junk=' ', min_ratio=0.6):
        """Returns a list with the indexes of each message that match the passed message with at least min_ratio
        Parameters:
            query (str): The text to search for
            ignore_case (bool): Whether or not to ignore case
            junk (str): A string in which each character is treated as "junk" and is ignored for the sake of computing
                        a similarity/difference score
            min_ratio (double): The minimum similarity ratio needed to consider a string as "matched". Between 0 and 1
                                with 0 being no match and 1 being a full match
        """
        assert isinstance(query, str), "query must be a string. Received {}: {}".format(type(query), str(query))
        assert isinstance(ignore_case, bool), "ignore_case must be a boolean. Received {}: {}"\
            .format(type(ignore_case), str(ignore_case))
        assert isinstance(junk, str), "junk must be a string. Received {}: {}".format(type(junk), str(junk))
        assert isinstance(min_ratio, float) or isinstance(min_ratio, int), (
            "min_ratio must be a float or int. Received {}: {}".format(type(min_ratio), str(min_ratio))
        )
        assert 0 <= min_ratio <= 1, "min_ratio must be between 0 and 1 (inclusive)"

        if ignore_case:
            query = query.lower()
            clean_msg = lambda x: x.lower()
        else:
            clean_msg = lambda x: x
        isjunk = lambda string: string in junk
        close_enough = lambda str_to_match: SequenceMatcher(isjunk, query, str_to_match).ratio() >= min_ratio
        matched = []
        for i in range(len(self)):
            if close_enough(clean_msg(self._convo[i][1])):
                matched.append(i)
        return matched

    def raw_longest_messages(self, num=None) -> list:
        """Returns a list of integers corresponding to message indexes, sorted in reverse order based on length (longest
        message index first)
        Parameters:
            num (optional): (int|None) the number of messages to include, or None to include all
        """
        assert num is None or isinstance(num, int), (
            "num must be None or an integer representing the number of messages desired"
        )
        num = min(num, len(self)) if num is not None else len(self)

        order = Counter({index: len(self._convo[index][1]) for index in range(len(self))})
        return order.most_common(num)

    # -----------------------------------------------   PUBLIC METHODS ---------------------------------------------- #

    # -----------------------------------------------   PRIVATE METHODS ---------------------------------------------- #

    @staticmethod
    def _start_kumo():
        """Calls the java program (data/word_clouds/wordclouds.jar), assuming that all conditions are met, and
        blocks until it has exited
        """
        # grabbed from http://stackoverflow.com/questions/438594/how-to-call-java-objects-and-functions-from-cpython
        process = subprocess.Popen("java -jar data/word_clouds/wordclouds.jar", shell=True)
        # Popen.wait() reaps the child and records its return code on the Popen object. os.waitpid on the raw
        # pid bypasses that bookkeeping and, on Windows, expects a process handle rather than a pid
        process.wait()

    @staticmethod
    def list_to_combined_string(list_of_people):
        """Combines a list of people into a single string, converting each perons's name to lowercase, separating
        first/middle/last name(s) with hyphens (-) and various individual's names with underscores (_). Cuts off a
        string past 255 raw_characters
        """
        name = ""
        for person in list_of_people:
            split = person.split(' ')
            for i in range(len(split) - 1):
                name += split[i]
                name += '-'
            name += split[-1]
            name += '_'
        name = name[:-1]
        if len(name) > 255:
            name = name[:255]
        else:
            name = name

        return name + '/'

    def setup_new_word_cloud(self, preferences):
        """Creates a WordCloud from the passed preferences and writes this conversation's word frequencies to the
        word cloud input directory
        Parameters:
            preferences: A dictionary that must contain a 'type' key whose value is in WordCloud.WORD_CLOUD_TYPES.
                         Only keys that also appear in the defaults for that type are used
        Return:
            The result of verify_word_cloud_setup() for the newly created word cloud
        """
        assert 'type' in preferences, "You must pass a type argument"
        wc_type = preferences['type']
        assert wc_type in WordCloud.WORD_CLOUD_TYPES, "invalid type, {0} is not in {1}"\
            .format(wc_type, WordCloud.WORD_CLOUD_TYPES)

        # Start from the defaults for this type and overwrite the entries the caller supplied
        settings = WordCloud.get_default_preferences(wc_type)
        for key in list(settings):
            if key in preferences:
                settings[key] = preferences[key]

        self._word_cloud = WordCloud(wc_type, settings)
        self.save_word_freq(path=WordCloud.WORD_CLOUD_INPUT_PATH)

        return self._word_cloud.verify_word_cloud_setup()

    def _raw_convo_starter(self, threshold, start=None, end=None):
        """Finds the messages that start a conversation
        Parameters:
            threshold: the smallest gap to the previous message, as measured by CustomDate.distance_from, that
                       makes a message count as a conversation start (presumably minutes - confirm in CustomDate)
            start (optional): date to begin at; located with CustomDate.bsearch_index. Defaults to the beginning
            end (optional): date to stop at; located with CustomDate.bsearch_index. Defaults to the last message
        Return:
            A Counter mapping each person's title-cased name to the list of indexes of messages with which they
            started a conversation. Every list is empty when the conversation has no messages
        """
        CustomDate.assert_dates(start, end)

        # Index of the first message to compare with its predecessor, and one past the last such message
        start_date_index = CustomDate.bsearch_index(self._convo, start, key=lambda x: x[2]) if start is not None else 1
        # The end index depends on `end`; testing `start` here ignored an explicit end date and searched for
        # None whenever only a start date was given
        end_date_index = CustomDate.bsearch_index(self._convo, end, key=lambda x: x[2]) \
            if end is not None else self._len

        convo_start_freq = {person: [] for person in self._people}
        if self._convo:
            # The message just before the scanned range always counts as a start
            # NOTE(review): if bsearch_index can return 0 for an explicit start this becomes index -1 - confirm
            convo_start_freq[self._convo[start_date_index - 1][0]].append(start_date_index - 1)
            for i in range(start_date_index, end_date_index):
                curr_date = self._convo[i][2]
                prev_date = self._convo[i - 1][2]
                if curr_date.distance_from(prev_date) >= threshold:
                    convo_start_freq[self._convo[i][0]].append(i)
        return Counter(dict((key.title(), val) for key, val in convo_start_freq.items()))

    def _raw_convo_killer(self, threshold, start=None, end=None):
        """Finds the messages that end ('kill') a conversation
        Parameters:
            threshold: the smallest gap to the following message, as measured by CustomDate.distance_from, that
                       makes a message count as a conversation killer (presumably minutes - confirm in CustomDate)
            start (optional): date to begin at; located with CustomDate.bsearch_index. Defaults to the beginning
            end (optional): date to stop at; located with CustomDate.bsearch_index. Defaults to the last message
        Return:
            A Counter mapping each person's title-cased name to the list of indexes of messages with which they
            killed a conversation. Every list is empty when the conversation has no messages
        """
        CustomDate.assert_dates(start, end)

        # Index of the first message to compare with its successor, and one past the last such message
        start_date_index = CustomDate.bsearch_index(self._convo, start, key=lambda x: x[2]) if start is not None else 0
        # The end index depends on `end`; testing `start` here ignored an explicit end date and searched for
        # None whenever only a start date was given
        end_date_index = CustomDate.bsearch_index(self._convo, end, key=lambda x: x[2]) \
            if end is not None else self._len - 1

        convo_start_freq = {person: [] for person in self._people}
        if self._convo:
            # With the default start of 0 this records index -1, i.e. the final message of the conversation
            # NOTE(review): with an explicit start this records the message before the range - confirm intent
            convo_start_freq[self._convo[start_date_index - 1][0]].append(start_date_index - 1)
            for i in range(start_date_index, end_date_index):
                curr_date = self._convo[i][2]
                next_date = self._convo[i + 1][2]
                if next_date.distance_from(curr_date) >= threshold:
                    convo_start_freq[self._convo[i][0]].append(i)
        return Counter(dict((key.title(), val) for key, val in convo_start_freq.items()))

    def _raw_word_freqs(self):
        """Returns a dictionary that maps names of people in the conversation
        to a Counter object of their raw word frequencies
        """
        raw_word_freq = dict()
        for person, msg, date in self._convo:
            if person not in raw_word_freq:
                raw_word_freq[person] = Counter()
            raw_word_freq[person].update(msg.lower().split(' '))
        return raw_word_freq

    def _cleaned_word_freqs(self):
        raw_words = self._raw_word_freqs()
        cleaned_words = dict()
        for key, val in raw_words.items():
            cleaned_words[key] = Counter()
            for word, freq in val.most_common():
                striped_word = word.strip('.!123456789-+?><}{][()\'\""\\ /*#$%^&#@,')
                if striped_word < 'z' * 10:
                    if '.com' not in striped_word and 'www.' not in striped_word \
                            and 'http' not in striped_word and '.io' not in striped_word \
                            and '.edu' not in striped_word:
                        if striped_word not in cleaned_words[key]:
                            cleaned_words[key][striped_word] = freq
                        else:
                            cleaned_words[key][striped_word] += freq
        return cleaned_words

    def __msgs_per_person(self):
        res = dict()
        for person, msg, date in self._convo:
            if person not in res:
                res[person] = 1
            else:
                res[person] += 1
        return Counter(res)

    def __msgs_spoken(self, name):
        name = name.lower()
        if name not in self._people:
            raise Exception("Invalid name passed")
        num = 0
        for person, msg, date in self._convo:
            if person == name:
                num += 1
        return num

    def __words_per_person(self):
        res = dict()
        for person, msg, date in self._convo:
            if person not in res:
                res[person] = len(msg.split())
            else:
                res[person] += len(msg.split())
        return Counter(res)

    def __words_spoken(self, name):
        name = name.lower()
        if name not in self._people:
            raise Exception("Invalid name passed")
        num = 0
        for person, msg, date in self._convo:
            if person == name:
                num += len(msg.split())
        return num

    def __ave_words_per_person(self):
        words = []
        for name in self._people:
            msgs = float(self.__msgs_spoken(name))
            tot_words = float(self.__words_spoken(name))
            if msgs > 0:
                words.append((name, tot_words / msgs))
        res = Counter()
        for name, ave in words:
            res[name] = ave
        return res

    def __ave_words(self, name):
        name = name.lower()
        if name not in self._people:
            return -1
        if self.__msgs_spoken(name) == 0:
            return 0
        return self.__words_spoken(name) / self.__msgs_spoken(name)

    def _assert_contact(self, contact):
        assert type(contact) in [type(None), str, list], "Contact must be of type string or a list of strings"
        if type(contact) is list:
            for i, ele in enumerate(contact):
                assert type(ele) is str, "Each element in contact must be a string"
                contact[i] = ele.lower()
            for ele in contact:
                assert ele in self._people, "{0} is not in the list of people for this conversation:\n{1}".format(
                    ele, str(self._people))
        elif type(contact) is str:
            assert contact in self._people, "{0} is not in the list of people for this conversation:\n{1}".format(
                contact, str(self._people))
            contact = [contact]

        return contact

    # -----------------------------------------------   PRIVATE METHODS ---------------------------------------------- #

    def __getitem__(self, index):
        """Returns the tuple (person, message, datetime) for the corresponding index"""
        if type(index) is not int:
            raise TypeError
        elif index >= len(self) or index < -len(self):
            raise IndexError
        else:
            return self._convo[index] if index >= 0 else self._convo[len(self) + index]

    def __len__(self):
        """Returns the number of messages in self"""
        return self._len

    def __str__(self):
        """Returns a string with the alphabetically sorted names of people
        in this conversation
        """
        return "Conversation for " + self._name.title()

    def __iter__(self):
        return (message for message in self._convo)
def word_cloud_result(convo_num):
    """Renders the word cloud result page
    Parameters:
        convo_num: Passed to load_all_gui to load the conversation
    Return:
        The rendered 'word_cloud_result.html' template, with cloud set to the value returned by
        WordCloud.newest_file(strip_output=True)
    """
    load_all_gui(convo_num)  # return value is not used here; the call itself is kept
    newest_cloud = WordCloud.newest_file(strip_output=True)
    return render_template('word_cloud_result.html', cloud=newest_cloud)