Example #1
    def get_vector(user_comments_full_path, most_frequent_words):
        """ calculate feature vector for user, based on most frequent words and basic features of that comments
        :param user_comments_full_path: path all comments of specific user
        :param most_frequent_words: set of most_frequent_words
        :return: feature vector for user
        """
        comments_train = FileUtils.get_list_of_comments(
            user_comments_full_path)

        word_freq_dict_train = dict(
            FeatureExtractor.get_word_list_frequency(comments_train))

        basic_features_value_list = FeatureExtractor.get_basic_features(
            comments_train)

        word_freq_feature_value_list = []

        for word in most_frequent_words:
            if word in word_freq_dict_train:
                word_freq_feature_value_list.append(word_freq_dict_train[word])
            else:
                word_freq_feature_value_list.append(0)

        vector = basic_features_value_list + word_freq_feature_value_list
        return vector
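
The word-frequency part of this vector can be reproduced without the project's FileUtils and FeatureExtractor helpers. A minimal, self-contained sketch, assuming comments are plain strings and simple whitespace tokenization (the name word_frequency_vector is hypothetical):

    import collections

    def word_frequency_vector(comments, vocabulary):
        """Count word occurrences over a fixed vocabulary for one user's comments."""
        counts = collections.Counter(
            word for comment in comments for word in comment.split())
        # one entry per vocabulary word, 0 when the user never used it
        return [counts.get(word, 0) for word in vocabulary]

    # usage: word_frequency_vector(["hello world", "hello again"], ["hello", "world", "bye"])
    # -> [2, 1, 0]
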
    def get_most_frequent_word_betwenn_all_commenters(
            path, most_frequent_word_per_author):
        """get most frequent words used between all commenters (users)
        :param path: path of commenter's comments
        :param most_frequent_word_per_author: count of most frequent word per user to be consider
        :return: most frequent words set
        """
        most_frequent_words = set()
        for users_comments_file in sorted(os.listdir(path)):
            """ for each author get top-most frequent word and added that to word-set"""
            comments_train = FileUtils.get_list_of_comments(
                os.sep.join([path, users_comments_file]))
            word_list_train = WordBaseFeatureExtractor.get_word_list_frequency(
                comments_train)

            # deduplicate while preserving frequency order, then keep the top-N words for this author
            top_words = [word_freq[0] for word_freq in word_list_train]
            top_words = list(collections.OrderedDict.fromkeys(top_words))
            most_frequent_words |= set(
                top_words[:most_frequent_word_per_author])
        return most_frequent_words
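
The vocabulary-building step can likewise be sketched without the project classes. A rough, self-contained version, assuming each author's comments are already loaded as lists of strings (most_frequent_words_across_authors and author_to_comments are hypothetical names):

    import collections

    def most_frequent_words_across_authors(author_to_comments, top_n):
        """Union of each author's top-N most frequent words."""
        vocabulary = set()
        for comments in author_to_comments.values():
            counts = collections.Counter(
                word for comment in comments for word in comment.split())
            # most_common() yields (word, count) pairs in descending frequency order
            vocabulary |= {word for word, _ in counts.most_common(top_n)}
        return vocabulary
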
Example #3
    def dataset_generator(self):
        """Generates dataset of users comments based on OneToOne method.
        """

        percentage = Config.ONE_TO_ONE_PERCENTAGE
        # to prevent bias, creation of the train and test sets is repeated randomly (one hundred times)
        retry_number = Config.ONE_TO_ONE_RETRY_NUMBER

        path = Config.DATASET_PATH_ORIGINAL
        prefix_filename = r"^(\d+A)"  # digits followed by "A" at the start of the file name

        for i in range(retry_number):

            for file in os.listdir(path):
                if file == ".gitignore":
                    continue

                """ read all comments of specific user """
                comments = FileUtils.get_list_of_comments(
                    os.sep.join([path, file])
                )

                """ calculate count comments to be anonymous """
                count_of_comments = len(comments)
                unknown_cm_count = (count_of_comments * percentage) // 100

                """ generate a file name for the known and unknown (anonymous) path """
                unknown_file_name = re.findall(prefix_filename, file)[0]
                base_unknown_file_path = unknown_file_name + ".txt"
                base_known_file_path = base_unknown_file_path

                """ randomly get comments to be anonymous """
                unknown_set = random.sample(comments, unknown_cm_count)
                known_set = list(set(comments) - set(unknown_set))

                """ prepare comments for write in file """
                comment_unknown_text = "\n".join(unknown_set)
                comment_known_text = "\n".join(known_set)

                known_file_path = os.sep.join(
                    [
                        Config.DATASET_PATH_ONE_TO_ONE,
                        str(i),
                        "known",
                        base_known_file_path,
                    ]
                )

                unknown_file_path = os.sep.join(
                    [
                        Config.DATASET_PATH_ONE_TO_ONE,
                        str(i),
                        "unknown",
                        base_unknown_file_path,
                    ]
                )

                """ write comments in new known and unknown files based on the method """
                FileUtils.write_file(unknown_file_path, comment_unknown_text)
                FileUtils.write_file(known_file_path, comment_known_text)
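
The split itself can be isolated from the Config and FileUtils helpers. A minimal sketch of the OneToOne idea with hypothetical names; unlike the set-difference step above, this variant preserves the order of the known comments:

    import random

    def one_to_one_split(comments, percentage, rounds, seed=None):
        """Yield (known, unknown) splits; sampling is repeated once per round."""
        rng = random.Random(seed)
        for _ in range(rounds):
            unknown_count = (len(comments) * percentage) // 100
            unknown = rng.sample(comments, unknown_count)
            unknown_set = set(unknown)
            known = [c for c in comments if c not in unknown_set]
            yield known, unknown

    # usage:
    # for round_index, (known, unknown) in enumerate(one_to_one_split(comments, 20, 100)):
    #     write the known/unknown files for this round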