from pathlib import Path
import json
from typing import Dict
from sample import Sample

if __name__ == "__main__":
    # Input tree: <root>/<candidate>/<sub-directory>/*.json
    root = Path("../tweet-data/merged")
    # Sibling of the input root named "<root>_no_dups"; not used in the
    # statements below (presumably the output location -- confirm).
    save_dir = root.parent / (root.stem + "_no_dups")
    biden_dir = root / "biden"
    trump_dir = root / "trump"

    # Step 1: for every sub-directory of a candidate (one per keyword, going
    # by the variable names), collect whatever Sample.get_id_and_sample
    # returns for each JSON file in it. The dict() call in step 2 requires
    # those results to be (key, value) pairs -- presumably (id, sample).
    biden_kw_data_lists = [
        [
            Sample.get_id_and_sample(json_file)
            for json_file in kw_dir.glob("*.json")
        ]
        for kw_dir in biden_dir.glob("*")
        if kw_dir.is_dir()
    ]
    trump_kw_data_lists = [
        [
            Sample.get_id_and_sample(json_file)
            for json_file in kw_dir.glob("*.json")
        ]
        for kw_dir in trump_dir.glob("*")
        if kw_dir.is_dir()
    ]

    # Step 2: drop samples duplicated within the same keyword. Building a
    # dict from the pairs keeps a single entry per key; when a key repeats,
    # the pair that comes last in the list wins.
    biden_kw_data_dicts = list(map(dict, biden_kw_data_lists))
    trump_kw_data_dicts = list(map(dict, trump_kw_data_lists))