def get_vector(user_comments_full_path, most_frequent_words):
    """Build the feature vector for a single user.

    The vector is the user's basic comment features followed by the
    user's frequency for each word in *most_frequent_words* (0 when the
    user never used the word).

    :param user_comments_full_path: path to the file holding all
        comments of one user
    :param most_frequent_words: set of globally most frequent words
    :return: list of feature values for the user
    """
    comments_train = FileUtils.get_list_of_comments(
        user_comments_full_path)
    word_freq_dict_train = dict(
        FeatureExtractor.get_word_list_frequency(comments_train))
    basic_features_value_list = FeatureExtractor.get_basic_features(
        comments_train)
    # dict.get with a default replaces the manual in/else lookup;
    # words the user never used contribute a frequency of 0.
    word_freq_feature_value_list = [
        word_freq_dict_train.get(word, 0) for word in most_frequent_words
    ]
    return basic_features_value_list + word_freq_feature_value_list
def get_most_frequent_word_betwenn_all_commenters(
        path, most_frequent_word_per_author):
    """Collect the most frequent words across all commenters (users).

    For each per-user comment file under *path*, take the user's top
    ``most_frequent_word_per_author`` distinct words and union them all
    into one set.

    :param path: directory containing one comment file per commenter
    :param most_frequent_word_per_author: count of most frequent words
        per user to be considered
    :return: set of most frequent words over all users
    """
    most_frequent_words = set()
    for users_comments_file in sorted(os.listdir(path)):
        comments_train = FileUtils.get_list_of_comments(
            os.sep.join([path, users_comments_file]))
        word_list_train = WordBaseFeatureExtractor.get_word_list_frequency(
            comments_train)
        # De-duplicate while keeping first-occurrence order (assumes
        # get_word_list_frequency yields (word, freq) pairs ordered by
        # frequency — TODO confirm), then keep the top-k words.
        unique_words = collections.OrderedDict.fromkeys(
            word_freq[0] for word_freq in word_list_train)
        top_words = list(unique_words)[:most_frequent_word_per_author]
        most_frequent_words.update(top_words)
    return most_frequent_words
def dataset_generator(self):
    """Generate the dataset of users' comments based on the OneToOne method.

    For every user file in the original dataset, repeat ``retry_number``
    times: randomly pick ``ONE_TO_ONE_PERCENTAGE`` percent of the user's
    comments as the "unknown" (anonymous) part and keep the rest as the
    "known" part, writing both under
    ``DATASET_PATH_ONE_TO_ONE/<iteration>/{known,unknown}``.
    """
    percentage = Config.ONE_TO_ONE_PERCENTAGE
    # To prevent bias, the creation of the train and test sets is
    # repeated this many times with a fresh random split each time.
    retry_number = Config.ONE_TO_ONE_RETRY_NUMBER
    path = Config.DATASET_PATH_ORIGINAL
    # Raw string: in a plain literal "\d" is an invalid escape sequence
    # (DeprecationWarning today, an error in future Python versions).
    # Compiled once here instead of re-parsing per file in the loop.
    prefix_pattern = re.compile(r"^(\d+A)")
    for i in range(retry_number):
        for file in os.listdir(path):
            if file == ".gitignore":
                continue
            # Read all comments of this specific user.
            comments = FileUtils.get_list_of_comments(
                os.sep.join([path, file]))
            # Number of comments to be made anonymous.
            count_of_comments = len(comments)
            unknown_cm_count = (count_of_comments * percentage) // 100
            # Derive the output file name from the user-id prefix
            # (raises IndexError if a file name does not match — same
            # behavior as the original [0] on an empty findall result).
            unknown_file_name = prefix_pattern.findall(file)[0]
            base_unknown_file_path = unknown_file_name + ".txt"
            base_known_file_path = base_unknown_file_path
            # Randomly choose the comments to be anonymous; the rest
            # stay known.  NOTE(review): the set subtraction drops
            # duplicate comments and loses ordering — confirm intended.
            unknown_set = random.sample(comments, unknown_cm_count)
            known_set = list(set(comments) - set(unknown_set))
            # Prepare the comment text for writing.
            comment_unknown_text = "\n".join(unknown_set)
            comment_known_text = "\n".join(known_set)
            known_file_path = os.sep.join([
                Config.DATASET_PATH_ONE_TO_ONE,
                str(i),
                "known",
                base_known_file_path,
            ])
            unknown_file_path = os.sep.join([
                Config.DATASET_PATH_ONE_TO_ONE,
                str(i),
                "unknown",
                base_unknown_file_path,
            ])
            # Write the split comment sets for this iteration.
            FileUtils.write_file(unknown_file_path, comment_unknown_text)
            FileUtils.write_file(known_file_path, comment_known_text)