#!/usr/bin/env python
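"""Normalise two datasets into uniform {url, label, source} records.

For each dataset this writes a urls.json with one record per URL and a
domains.json with labels aggregated per domain (via utils.compute_by_domain):

- hyperpartisan: ground-truth XML files; hyperpartisan articles are labelled
  'fake', the rest 'true'
- jruvika_fakenews: a CSV of URLs with binary labels, where 1 marks a true story
"""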

import glob
import xml.etree.ElementTree as ET
from collections import defaultdict

import utils

subfolder = utils.data_location / 'hyperpartisan'

results = []
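# each ground-truth-*.xml file holds <article> elements whose 'url' and
# 'hyperpartisan' attributes provide everything we need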
for input_file in glob.glob(str(subfolder /
                                'intermediate/ground-truth-*.xml')):
    tree = ET.parse(input_file)
    articles = tree.getroot().findall('article')
    # hyperpartisan articles count as 'fake'; everything else as 'true'
    results.extend([{
        'url': el.attrib['url'],
        'label': 'fake' if el.attrib['hyperpartisan'] == 'true' else 'true',
        'source': 'hyperpartisan',
    } for el in articles])

utils.write_json_with_path(results, subfolder, 'urls.json')
utils.print_stats(results)
by_domain = utils.compute_by_domain(results)

utils.write_json_with_path(by_domain, subfolder, 'domains.json')

# jruvika fake news: a CSV of URLs with binary labels (1 = true story)
subfolder = utils.data_location / 'jruvika_fakenews'

# the source file is comma-separated, so override the default delimiter
data = utils.read_tsv(subfolder / 'source' / 'data.csv', delimiter=',')

print('total rows', len(data))

# many URLs appear in more than one row, so deduplicate by URL first
by_url = defaultdict(set)
for el in data:
    # two rows have two different URLs each
    keys = [k.strip() for k in el['URLs'].split('; ')]
    value = 'true' if el['Label'] == '1' else 'fake'
    for k in keys:
        by_url[k].add(value)
        # when a URL is duplicated, its label must always be the same
        assert len(by_url[k]) == 1, k
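
# thanks to the assert above, every value set is a singleton,
# so set.pop() yields the unique label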
urls = [{
    'url': k,
    'label': v.pop(),
    'source': 'jruvika_fakenews'
} for k, v in by_url.items()]
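# a hypothetical resulting record:
# {'url': 'http://example.com/story', 'label': 'true', 'source': 'jruvika_fakenews'}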
print('unique urls', len(urls))

utils.write_json_with_path(urls, subfolder, 'urls.json')

by_domain = utils.compute_by_domain(urls)

utils.write_json_with_path(by_domain, subfolder, 'domains.json')