예제 #1
0
def read_tweet_json(folder, id_thread):
    """Load one conversation thread from disk and build its ``ThreadTree``.

    Three inputs are read, in this order:

    * ``<folder>/source-tweet/<id_thread>.json`` -- the source tweet,
    * every file matched by ``<folder>/replies/*`` -- the reply tweets,
    * ``<folder>/structure.json`` -- the reply structure.

    Args:
        folder: Path of the thread directory (string).
        id_thread: Thread identifier; it is concatenated into the source
            tweet path, so it must be a string.

    Returns:
        The ``ThreadTree`` created from the source tweet and the replies,
        after ``build_tree`` has been called with the reply structure.

    Raises:
        OSError: If one of the expected files cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    import os

    def _load_json(path):
        # ``with`` guarantees the handle is closed even when parsing fails.
        # The decoding is done by ``open``: the ``encoding`` keyword of
        # ``json.loads`` was removed in Python 3.9 and raises TypeError there.
        with open(path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)

    source_tweet = _load_json(folder + "/source-tweet/" + id_thread + ".json")

    # Replies are keyed by the file name up to its first dot.
    reply_tweets = {}
    for reply_path in glob.glob(folder + "/replies/*"):
        # basename() honours the separator of the running platform, whereas
        # splitting on a literal backslash only worked on Windows and only
        # for one specific nesting depth of ``folder``.
        reply_id = os.path.basename(reply_path).split(".")[0]
        reply_tweets[reply_id] = _load_json(reply_path)

    reply_structure = _load_json(folder + "/structure.json")

    thread = ThreadTree(id_thread, source_tweet, reply_tweets, 'data/tweets/')
    thread.build_tree(reply_structure)
    return thread
예제 #2
0
def get_threads_from_story(dataset_folder, story):
    """List the thread folders of a story together with their ids.

    Args:
        dataset_folder: Path prefix of the dataset; it is concatenated
            directly with ``story``, so it should end with a separator.
        story: Name of the story sub-folder.

    Returns:
        A tuple ``(folders, ids)``: ``folders`` is the list of paths matched
        by ``<dataset_folder><story>/*`` and ``ids`` holds the last path
        component of each of those paths, in the same order.
    """
    import os

    folders = glob.glob(dataset_folder + story + '/*')
    # basename() uses the running platform's separator rules; splitting on a
    # literal backslash raised IndexError on POSIX and picked the wrong
    # component whenever ``dataset_folder`` itself contained backslashes.
    ids = [os.path.basename(folder) for folder in folders]
    return folders, ids
예제 #3
0
def get_threads_from_story(story):
    """List the test-set thread folders of a story together with their ids.

    The search pattern is built from the module-level
    ``twitter_test_set_folder`` prefix, concatenated directly with ``story``.

    Args:
        story: Name of the story sub-folder.

    Returns:
        A tuple ``(folders, ids)``: ``folders`` is the list of paths matched
        by ``<twitter_test_set_folder><story>/*`` and ``ids`` holds the last
        path component of each of those paths, in the same order.
    """
    import os

    folders = glob.glob(twitter_test_set_folder + story + '/*')
    # basename() uses the running platform's separator rules; splitting on a
    # literal backslash raised IndexError on POSIX and picked the wrong
    # component whenever the prefix itself contained backslashes.
    ids = [os.path.basename(folder) for folder in folders]
    return folders, ids
예제 #4
0

if __name__ == "__main__":

    folder_paths = ["data/class_labels", "data/datasets", "data/features/labels", "data/features/tf_idf",
                    "data/features/top", "data/features/word2vec", "data/test_tweets", "data/tf_idf_documents",
                    "data/thread_lists", "data/threads", "data/tweets", "data/vocabularies",
                    "data/word2vec_documents", "models/", "scores/ablation", "submissions/", "plots/"]

    for folder_path in folder_paths:
        if not exists(folder_path):
            makedirs(folder_path)

    twitter_dataset_folder = "../rumoureval-2019-training-data/twitter-english/"
    if not isfile('data/twitter_stories'):
        twitter_stories_folders = glob.glob(twitter_dataset_folder + "*")
        twitter_stories = [
            story_folder.split("\\")[1] for story_folder in twitter_stories_folders
        ]
        save_object(twitter_stories, 'data/twitter_stories')
    else:
        twitter_stories = load_object('data/twitter_stories')

    print('Stories: ' + str(twitter_stories))

    if not isfile('data/annotations'):
        print('Reading tweet annotations...')
        annotations_file = open(
            "../pheme-rumour-scheme-dataset/annotations/en-scheme-annotations.json", "r")
        annotations_data = [line.rstrip('\n') for line in annotations_file]
        annotation_files_veracity = glob.glob('../pheme-rumour-scheme-dataset/threads/en/*/*/annotation.json',
예제 #5
0
    return thread


def create_submission_file(task_a_result, task_b_result):
    """Write both sub-task results to ``./submissions/answer.json``.

    The file contains a single JSON object with the keys
    ``"subtaskaenglish"`` and ``"subtaskbenglish"``. An existing file is
    overwritten.

    Args:
        task_a_result: JSON-serialisable value stored under
            ``"subtaskaenglish"``.
        task_b_result: JSON-serialisable value stored under
            ``"subtaskbenglish"``.

    Raises:
        OSError: If the file cannot be opened for writing (for example when
            the ``submissions`` directory does not exist).
        TypeError: If a result is not JSON-serialisable.
    """
    submission_data = {
        "subtaskaenglish": task_a_result,
        "subtaskbenglish": task_b_result,
    }
    # Serialise before opening so a serialisation error does not leave a
    # truncated answer file behind.
    submission_data_json = json.dumps(submission_data)
    # ``with`` closes the handle even if the write fails.
    with open('./submissions/answer.json', 'w') as submission_file:
        submission_file.write(submission_data_json)


if __name__ == '__main__':
    twitter_stories_folders = glob.glob(twitter_test_set_folder + "*")
    twitter_stories = [
        story_folder.split("\\")[1] for story_folder in twitter_stories_folders
    ]
    print('Stories in test set: ' + str(twitter_stories))

    print('Processing tweet threads...')
    graph_features = {}
    for twitter_story in twitter_stories:
        thread_folders, thread_ids = get_threads_from_story(twitter_story)
        print('Number of threads about ' + twitter_story + ': ' +
              str(len(thread_ids)))
        for thread_folder, thread_id in zip(thread_folders, thread_ids):
            thread_tree = read_tweet_json(thread_folder, thread_id)
            thread_nx_graph = thread_tree.get_nx_graph()
            graph_features.update(