def read_tweet_json(folder, id_thread):
    """Read one tweet thread from disk and return it as a built ThreadTree.

    Expects *folder* to contain:
      - ``source-tweet/<id_thread>.json`` — the root tweet,
      - ``replies/*.json``                — one file per reply tweet,
      - ``structure.json``                — the reply nesting structure.

    Args:
        folder: path of the thread directory (no trailing separator).
        id_thread: thread id string; also the source-tweet file basename.

    Returns:
        A ThreadTree populated via ``build_tree`` with the reply structure.
    """
    # NOTE: json.loads(..., encoding=...) was deprecated in 3.1 and removed
    # in Python 3.9 — decode at the file layer instead.  'with' guarantees
    # the handles are closed even if parsing raises.
    with open(folder + "/source-tweet/" + id_thread + ".json", "r",
              encoding="utf-8") as source_file:
        source_tweet = json.loads(source_file.read())

    reply_tweets = {}
    for reply_path in glob.glob(folder + "/replies/*"):
        # Normalize separators so the id extraction works on both Windows
        # ('\\') and POSIX ('/') glob output; the old split("\\")[2] was
        # Windows-only and broke if the path depth changed.
        reply_thread_id = reply_path.replace("\\", "/").split("/")[-1].split(".")[0]
        with open(reply_path, "r", encoding="utf-8") as reply_file:
            reply_tweets[reply_thread_id] = json.loads(reply_file.read())

    with open(folder + "/structure.json", "r", encoding="utf-8") as structure_file:
        reply_structure = json.loads(structure_file.read())

    thread = ThreadTree(id_thread, source_tweet, reply_tweets, 'data/tweets/')
    thread.build_tree(reply_structure)
    return thread
def get_threads_from_story(dataset_folder, story):
    """List the thread directories belonging to one story.

    Args:
        dataset_folder: dataset root, ending with a path separator.
        story: story name; its threads live under ``dataset_folder + story``.

    Returns:
        Tuple ``(folders, ids)``: full thread paths and the thread-id
        basenames, index-aligned with each other.
    """
    folders = glob.glob(dataset_folder + story + '/*')
    # Normalize separators before taking the last component: glob emits
    # '\\' on Windows and '/' on POSIX, so the previous split("\\")[1]
    # only worked on Windows.
    ids = [
        folder.replace("\\", "/").rstrip("/").split("/")[-1]
        for folder in folders
    ]
    return folders, ids
def get_threads_from_story(story):
    """List the thread directories of *story* in the Twitter test set.

    Uses the module-level ``twitter_test_set_folder`` (defined elsewhere in
    this file) as the dataset root.

    Args:
        story: story name; its threads live under
            ``twitter_test_set_folder + story``.

    Returns:
        Tuple ``(folders, ids)``: full thread paths and the thread-id
        basenames, index-aligned with each other.
    """
    folders = glob.glob(twitter_test_set_folder + story + '/*')
    # Normalize separators before taking the last component: glob emits
    # '\\' on Windows and '/' on POSIX, so the previous split("\\")[1]
    # only worked on Windows.
    ids = [
        folder.replace("\\", "/").rstrip("/").split("/")[-1]
        for folder in folders
    ]
    return folders, ids
if __name__ == "__main__": folder_paths = ["data/class_labels", "data/datasets", "data/features/labels", "data/features/tf_idf", "data/features/top", "data/features/word2vec", "data/test_tweets", "data/tf_idf_documents", "data/thread_lists", "data/threads", "data/tweets", "data/vocabularies", "data/word2vec_documents", "models/", "scores/ablation", "submissions/", "plots/"] for folder_path in folder_paths: if not exists(folder_path): makedirs(folder_path) twitter_dataset_folder = "../rumoureval-2019-training-data/twitter-english/" if not isfile('data/twitter_stories'): twitter_stories_folders = glob.glob(twitter_dataset_folder + "*") twitter_stories = [ story_folder.split("\\")[1] for story_folder in twitter_stories_folders ] save_object(twitter_stories, 'data/twitter_stories') else: twitter_stories = load_object('data/twitter_stories') print('Stories: ' + str(twitter_stories)) if not isfile('data/annotations'): print('Reading tweet annotations...') annotations_file = open( "../pheme-rumour-scheme-dataset/annotations/en-scheme-annotations.json", "r") annotations_data = [line.rstrip('\n') for line in annotations_file] annotation_files_veracity = glob.glob('../pheme-rumour-scheme-dataset/threads/en/*/*/annotation.json',
return thread def create_submission_file(task_a_result, task_b_result): submission_data = { "subtaskaenglish": task_a_result, "subtaskbenglish": task_b_result } submission_data_json = json.dumps(submission_data) submission_file = open('./submissions/answer.json', 'w') submission_file.write(submission_data_json) submission_file.close() if __name__ == '__main__': twitter_stories_folders = glob.glob(twitter_test_set_folder + "*") twitter_stories = [ story_folder.split("\\")[1] for story_folder in twitter_stories_folders ] print('Stories in test set: ' + str(twitter_stories)) print('Processing tweet threads...') graph_features = {} for twitter_story in twitter_stories: thread_folders, thread_ids = get_threads_from_story(twitter_story) print('Number of threads about ' + twitter_story + ': ' + str(len(thread_ids))) for thread_folder, thread_id in zip(thread_folders, thread_ids): thread_tree = read_tweet_json(thread_folder, thread_id) thread_nx_graph = thread_tree.get_nx_graph() graph_features.update(