    def __build_tree_r(self, node, reply_structure):
        # Depth-first walk over the nested reply structure: attach a Tweet
        # child for every reply, cache it to disk, then recurse into its
        # own replies.
        if len(reply_structure) == 0:
            return
        for child_tweet_id, child_reply_structure in reply_structure.items():
            if child_tweet_id in self.reply_tweets:
                child_tweet = self.reply_tweets[child_tweet_id]
            else:
                child_tweet = None
            if (ThreadTree.tweet_annotations is not None
                    and child_tweet_id in ThreadTree.tweet_annotations):
                child = Tweet(child_tweet_id, child_tweet, node, self.root,
                              ThreadTree.tweet_annotations[child_tweet_id])
            else:
                child = Tweet(child_tweet_id, child_tweet, node, self.root,
                              None)
            node.children.append(child)
            if not isfile(self.tweet_folder + child_tweet_id):
                save_object(child, self.tweet_folder + child_tweet_id)
            self.__build_tree_r(child, child_reply_structure)
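    # For context, a minimal sketch (an assumption, not shown in this
    # excerpt): from their call sites, save_object and load_object look
    # like simple pickle-based helpers along these lines.
    #
    #     import pickle
    #
    #     def save_object(obj, path):
    #         # Serialize obj to path (hypothetical implementation).
    #         with open(path, 'wb') as f:
    #             pickle.dump(obj, f)
    #
    #     def load_object(path):
    #         # Deserialize and return the object stored at path.
    #         with open(path, 'rb') as f:
    #             return pickle.load(f)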
    return thread


if __name__ == "__main__":
    # Create the directory tree that the rest of the pipeline writes into.
    folder_paths = [
        "data/class_labels", "data/datasets", "data/features/labels",
        "data/features/tf_idf", "data/features/top", "data/features/word2vec",
        "data/test_tweets", "data/tf_idf_documents", "data/thread_lists",
        "data/threads", "data/tweets", "data/vocabularies",
        "data/word2vec_documents", "models/", "scores/ablation",
        "submissions/", "plots/"
    ]
    for folder_path in folder_paths:
        if not exists(folder_path):
            makedirs(folder_path)

    twitter_dataset_folder = "../rumoureval-2019-training-data/twitter-english/"
    if not isfile('data/twitter_stories'):
        twitter_stories_folders = glob.glob(twitter_dataset_folder + "*")
        # basename (from os.path) is portable; the original split("\\")[1]
        # only worked with Windows path separators.
        twitter_stories = [
            basename(story_folder)
            for story_folder in twitter_stories_folders
        ]
        save_object(twitter_stories, 'data/twitter_stories')
    else:
        twitter_stories = load_object('data/twitter_stories')
    print('Stories: ' + str(twitter_stories))

    if not isfile('data/annotations'):
        print('Reading tweet annotations...')
        # The annotation file is read line by line, so it is treated as
        # newline-delimited JSON (one annotation object per line) rather
        # than a single JSON document. A context manager ensures the file
        # handle is closed after reading.
        with open(
                "../pheme-rumour-scheme-dataset/annotations/"
                "en-scheme-annotations.json", "r") as annotations_file:
            annotations_data = [line.rstrip('\n') for line in annotations_file]
    def build_tree(self, reply_structure):
        # The single top-level entry is the source tweet; its value holds
        # the nested reply structure for the rest of the conversation.
        source_tweet_id, source_reply_structure = tuple(
            reply_structure.items())[0]
        if not isfile(self.tweet_folder + source_tweet_id):
            save_object(self.root, self.tweet_folder + source_tweet_id)
        self.__build_tree_r(self.root, source_reply_structure)
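    # Illustrative only (the shape is inferred from how build_tree and
    # __build_tree_r consume it): the reply structure is a nested dict of
    # tweet-ID strings, e.g.
    #
    #     {"1001": {"1002": {}, "1003": {"1004": {}}}}
    #
    # where "1001" is the source tweet. build_tree stores "1001" as the
    # root, then __build_tree_r attaches "1002" and "1003" as children of
    # the root and "1004" as a child of "1003".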
    print('Stories in test set: ' + str(twitter_stories))

    print('Processing tweet threads...')
    graph_features = {}
    for twitter_story in twitter_stories:
        thread_folders, thread_ids = get_threads_from_story(twitter_story)
        print('Number of threads about ' + twitter_story + ': ' +
              str(len(thread_ids)))
        for thread_folder, thread_id in zip(thread_folders, thread_ids):
            # Build the thread tree, convert it to a networkx graph and
            # collect the per-thread graph features.
            thread_tree = read_tweet_json(thread_folder, thread_id)
            thread_nx_graph = thread_tree.get_nx_graph()
            graph_features.update(
                get_graph_features(thread_nx_graph, thread_tree))

    tweet_files = glob.glob('data/test_tweets/*')
    if not isfile('data/datasets/test_dataset_dictionary'):
        # Accumulators for the per-tweet feature groups.
        tf_idf_document_sets = [{} for i in range(8)]
        language_style_features_tweets = {}
        language_style_features_user_descriptions = {}
        sentiment_features = {}
        extra_features = {}
        for i, tweet_file in enumerate(tweet_files):
            print('Processing tweet #' + str(i + 1) + '...')
            tweet = load_object(tweet_file)
            tweet_id = tweet.id
            tweet_words = tweet.get_words_from_tweet_text()
            tweet_ngrams = get_ngrams_from_words(tweet_words)
            for n, ngrams in enumerate(tweet_ngrams):
    ]
    # Drop one feature group at a time so each ablation dataset measures
    # that group's contribution to the model.
    dataset_without_columns = dataset.drop(language_style_columns, axis=1)
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(tf_idf_word2vec_columns, axis=1)
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(sentiment_columns, axis=1)
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(graph_columns, axis=1)
    datasets.append(dataset_without_columns)
    dataset_without_columns = dataset.drop(extra_columns, axis=1)
    datasets.append(dataset_without_columns)
    return datasets


if __name__ == "__main__":
    if not isfile('scores/ablation/task_a_comment_scores.tsv'):
        print("Performing ablation experiment for comment model...")
        comment_dataset = pd.read_csv(
            'data/datasets/task_a_comment_dataset.tsv',
            sep='\t',
            index_col=False,
            header=0,
            encoding='utf-8')
        comment_class_labels = load_object(
            'data/class_labels/task_a_comment_class_labels')
        comment_ablation_scores = {}
        ablation_datasets = get_ablation_datasets(comment_dataset)
        # Free the full dataset once the ablation copies have been built.
        del comment_dataset
        for ablation_label, ablation_dataset in zip(ablation_labels,