def process(data, output_folder, source):
    mappings = {
        'fake': 'fake',
        'bias': 'fake',
        'conspiracy': 'fake',
        'junksci': 'fake',
        'hate': 'fake',
        'clickbait': 'fake',
        #'unreliable': 'fake',
        'reliable': 'true'
    }
    properties = ['type', '2nd type', '3rd type']
    results = []
    # find the properties belonging to the mappings in the samples, and assign a single label
    for domain, props in data.items():
        looking_at = [
            prop_value for prop_name, prop_value in props.items()
            if prop_name in properties and prop_value
        ]
        #print(looking_at)
        classes = set(mappings[el] for el in looking_at if el in mappings)
        if len(classes) != 1:
            print(domain, classes)
            continue
        label = classes.pop()
        results.append({'domain': domain, 'label': label, 'source': source})

    utils.write_json_with_path(results, output_folder, 'domains.json')
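# Note: all of the scripts in this section rely on a shared `utils` module
# (read_tsv, read_json, write_json_with_path, data_location, ...) that is not shown here.
# The following is only a minimal sketch of how those helpers could look, assuming the
# output folder is created on demand; the real module may differ.
import csv
import json
from pathlib import Path

data_location = Path('data')

def read_tsv(path):
    # read a tab-separated file into a list of dicts, one per row
    with open(path) as f:
        return list(csv.DictReader(f, delimiter='\t'))

def read_json(path):
    with open(path) as f:
        return json.load(f)

def write_json_with_path(content, path, filename, indent=2):
    # create the destination folder if needed, then serialise `content` as JSON
    path.mkdir(parents=True, exist_ok=True)
    with open(path / filename, 'w') as f:
        json.dump(content, f, indent=indent)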
def get_claimreviews_from_factcheckers(original_claimReviews):
    result = {}
    # retrieve the full claimReview from the fact checking website
    for idx, c in enumerate(tqdm(original_claimReviews)):
        # get the correct URL (some of them are wrong in the original dataset)
        fixed_url = claimreview.get_corrected_url(c['url'])
        # the id and per-claim file are only there to make the operation resumable after
        # a failure: the single claims are saved to disk one by one
        id = utils.string_to_md5(fixed_url)
        partial_file_name = '{}.json'.format(id)
        partial_file_path = subfolder_path / 'intermediate' / 'single_claims' / partial_file_name
        if os.path.isfile(partial_file_path):
            # if it's been already saved, read it
            partial = utils.read_json(partial_file_path)
        else:
            # otherwise download the original claimReview from the fact checker
            url, partial = claimreview.retrieve_claimreview(c['url'])
            # and save it to disk
            utils.write_json_with_path(
                partial, subfolder_path / 'intermediate' / 'single_claims',
                partial_file_name)
        if not partial:
            # in this case there is no claimReview metadata on the fact checker website
            #print(c['url'])
            pass
        if len(partial):
            # there can be multiple claimReviews in a single fact checking page
            for j, claimReview in enumerate(partial):
                # save this in the result
                result['{}::{}'.format(fixed_url, j)] = claimReview
    return result
def load_jsonld():
    # read the file
    with open(source_file_path) as f:
        content = f.read()
    # extract the embedded metadata https://github.com/scrapinghub/extruct
    data = extruct.extract(content)
    claimReviews = data['json-ld']

    # some analysis of the labels to see how they are annotated
    labels = set([el['reviewRating']['alternateName'] for el in claimReviews])
    lambda_source = lambda el: el['author']['name']
    # group the labels by the author of the review, to see how each of them uses the alternateName
    labels_by_sources = {
        k: set([el['reviewRating']['alternateName'] for el in v])
        for k, v in itertools.groupby(sorted(claimReviews, key=lambda_source), key=lambda_source)
    }
    print('#claimReviews', len(claimReviews))
    print('#labels', len(labels))
    #print('labels', labels)
    print('#label for each source', {k: len(v) for k, v in labels_by_sources.items()})

    # save the original claimReviews
    utils.write_json_with_path(claimReviews, intermediate_path, 'datacommons_claimReviews.json')
    return claimReviews
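# The fields accessed above follow the schema.org ClaimReview vocabulary
# (reviewRating.alternateName holds the verdict, author.name the fact checker).
# A trimmed, purely illustrative example of one embedded item (hypothetical values):
example_claimReview = {
    '@context': 'http://schema.org',
    '@type': 'ClaimReview',
    'url': 'https://factchecker.example/reviews/some-claim',  # hypothetical URL
    'author': {'@type': 'Organization', 'name': 'Example Fact Checker'},
    'reviewRating': {'@type': 'Rating', 'alternateName': 'False'},
}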
#!/bin/env python

import utils

location = utils.data_location / 'wikipedia'

data = utils.read_tsv(location / 'source' / 'wikipedia.tsv')

domains = [{
    'domain': el['url'],
    'label': el['label'],
    'source': 'wikipedia'
} for el in data]

utils.write_json_with_path(domains, location, 'domains.json')
}
print('types', cnt_by_type)

by_site_fn = lambda el: el['site_url']
types_by_domain = {
    k: set([el['type'] for el in v])
    for k, v in itertools.groupby(sorted(data, key=by_site_fn), key=by_site_fn)
}

mappings = {
    'fake': 'fake',
    'junksci': 'fake',
    'hate': 'fake',
    'bs': 'fake',
    'bias': 'fake',
    'conspiracy': 'fake'
}

result = []
for k, v in types_by_domain.items():
    assert len(v) == 1
    label = v.pop()
    if label in mappings:
        result.append({
            'domain': k,
            'label': mappings[label],
            'source': 'mrisdal_fakenews'
        })

utils.write_json_with_path(result, subfolder_path, 'domains.json')
#!/bin/env python

import json
import csv

import utils

folder = utils.data_location / 'several27_fakenews'

# read the file without utils, 27.3 GB is too large for my pc
# TODO limit RAM used!!!
input_path = folder / 'source' / 'news_cleaned_2018_02_13.csv'
output_path = folder / 'intermediate'
output_file = output_path / 'unfiltered.json'

results = []
chunk_n = 0
with open(input_path) as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        results.append({'url': row['url'], 'label': row['type'], 'source': 'several27_fakenews'})
        if not len(results) % 1000000:
            # print rough progress and flush the current chunk to disk
            print(len(results) * (chunk_n + 1) / 216212648)
            utils.write_json_with_path(results, output_path, 'unfiltered_{}.json'.format(chunk_n), indent=None)
            results = []
            chunk_n += 1
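# The loop above already streams the CSV row by row, so memory stays bounded by one
# chunk of at most a million row dicts. If that is still too heavy, a chunked pandas
# read is one possible alternative; this is only a sketch and assumes the same
# `url` and `type` columns exist in the CSV.
import pandas as pd

chunk_n = 0
for chunk in pd.read_csv(input_path, usecols=['url', 'type'], chunksize=1_000_000):
    results = [{'url': u, 'label': t, 'source': 'several27_fakenews'}
               for u, t in zip(chunk['url'], chunk['type'])]
    utils.write_json_with_path(results, output_path,
                               'unfiltered_{}.json'.format(chunk_n), indent=None)
    chunk_n += 1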
#!/bin/env python

import glob
import xml.etree.ElementTree as ET

import utils

subfolder = utils.data_location / 'hyperpartisan'

results = []
for input_file in glob.glob(str(subfolder / 'intermediate/ground-truth-*.xml')):
    with open(input_file) as f:
        tree = ET.parse(f)
        articles = tree.getroot().findall('article')
        results.extend([{
            'url': el.attrib['url'],
            'label': 'fake' if el.attrib['hyperpartisan'] == 'true' else 'true',
            'source': 'hyperpartisan'
        } for el in articles])

utils.write_json_with_path(results, subfolder, 'urls.json')
utils.print_stats(results)

by_domain = utils.compute_by_domain(results)
utils.write_json_with_path(by_domain, subfolder, 'domains.json')
from collections import defaultdict
from tqdm import tqdm

import utils
import unshortener

location = utils.data_location / 'rbutr'

data = utils.read_tsv(location / 'source' / 'link_data.tab.txt')

results = [{
    'url': el['sourcepage'],
    'label': 'fake',
    'source': 'rbutr'
} for el in data]

utils.write_json_with_path(results, location, 'urls.json')

domains = utils.compute_by_domain(results)
utils.write_json_with_path(domains, location, 'domains.json')

rebuttals = defaultdict(lambda: defaultdict(list))
for row in data:
    rebuttals[row['sourcepage']][row['rebuttalpage']].append('rbutr')

utils.write_json_with_path(rebuttals, location, 'rebuttals.json')

# check which urls still exist
rbutr_mapping_location = location / 'intermediate'
rbutr_mapping_path = rbutr_mapping_location / 'mappings.json'
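# The repository's `unshortener` module (imported above) is what actually resolves and
# checks these URLs; its API is not shown in this section. As a rough stand-in, a
# liveness check could look like the following sketch, which assumes the `requests`
# package and ignores redirects loops and rate limiting.
import requests

def url_still_exists(url, timeout=10):
    # a HEAD request is usually enough to tell whether the page is still served
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        return response.status_code < 400
    except requests.RequestException:
        return False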
#!/bin/env python

import utils

directory = utils.data_location / 'golbeck_fakenews'

# this input file has been exported to TSV from `Fake News Stories.xlsx`
input_file = directory / 'intermediate' / 'data.tsv'

data = utils.read_tsv(input_file)

result = [{'url': row['URL of article'], 'label': 'fake', 'source': 'golbeck_fakenews'}
          for row in data if row['Fake or Satire?'].strip() == 'Fake']

utils.write_json_with_path(result, directory, 'urls.json')

by_domain = utils.compute_by_domain(result)
utils.write_json_with_path(by_domain, directory, 'domains.json')

rebuttals = {el['URL of article']: {u.strip(): ['golbeck_fakenews']
                                    for u in el['URL of rebutting article'].split('; ')}
             for el in data}

utils.write_json_with_path(rebuttals, directory, 'rebuttals.json')
if __name__ == '__main__':
    claimReviews = load_jsonld()

    # URLs pointing to the fact checking articles themselves are labelled as true
    urls = [{
        'url': c['url'],
        'label': 'true',
        'source': 'datacommons_factcheck'
    } for c in claimReviews]

    # retrieve the claimReviews with more properties
    claimReviews_full = get_claimreviews_from_factcheckers(claimReviews)
    # save to file
    utils.write_json_with_path(claimReviews_full, subfolder_path, 'claimReviews.json')

    # rebuttals is a dict that associates each URL with other related URLs;
    # in this case it is used for suggesting the fact checking article to read
    rebuttals = defaultdict(lambda: defaultdict(list))
    for key, claimReview in claimReviews_full.items():
        # retrieve the URL of the source of the claim (not always there)
        claim_urls = claimreview.get_claim_urls(claimReview)
        if claim_urls:
            print('claim', claim_urls)
        if 'properties' in claimReview:
            fixed_url = claimreview.get_corrected_url(
                claimReview['properties']['url'])
        else:
            fixed_url = claimreview.get_corrected_url(claimReview['url'])
        # save the found mapping between the claim URL and the fact checking URL
#!/bin/env python

import utils

location = utils.data_location / 'factcheckni_list'

data = utils.read_tsv(location / 'source' / 'FactCheckNI Articles - OU Research - Sheet1.tsv')

label_map = {
    'Accurate': 'true',
    # 'Unsubstantiated': neither true nor false, no proof --> discard
    'Inaccurate': 'fake'
}

labeled_urls = [{
    'url': row['Claim URL'],
    'label': label_map[row['Label']],
    'source': 'factcheckni_list'
} for row in data if row['Label'] in label_map]

rebuttals = {
    row['Claim URL']: {
        row['Article URL']: ['factcheckni_list']
    } for row in data
}

utils.write_json_with_path(labeled_urls, location, 'urls.json')
utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
    rebuttals = utils.read_json(utils.data_location / subfolder / 'rebuttals.json')
    for source_url, rebuttal_l in rebuttals.items():
        for rebuttal_url, source in rebuttal_l.items():
            # the values are lists of dataset names, so merge them instead of nesting
            all_rebuttals[source_url][rebuttal_url].extend(source)

urls_cnt = len(all_urls)
domains_cnt = len(all_domains)
fake_urls_cnt = len([el for el in all_urls if el['label'] == 'fake'])
fake_domains_cnt = len([el for el in all_domains if el['label'] == 'fake'])

print('#urls', urls_cnt, ': fake', fake_urls_cnt, 'true', urls_cnt - fake_urls_cnt)
print('#domains', domains_cnt, ': fake', fake_domains_cnt, 'true', domains_cnt - fake_domains_cnt)

aggregated_urls = utils.aggregate(all_urls)
aggregated_domains = utils.aggregate(all_domains, 'domain')

utils.write_json_with_path(aggregated_urls, utils.data_location, 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, utils.data_location, 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, utils.data_location, 'aggregated_rebuttals.json')

# copy to backend
utils.write_json_with_path(aggregated_urls, Path('../backend'), 'aggregated_urls.json')
utils.write_json_with_path(aggregated_domains, Path('../backend'), 'aggregated_domains.json')
utils.write_json_with_path(all_rebuttals, Path('../backend'), 'aggregated_rebuttals.json')

utils.print_stats(aggregated_urls)
utils.print_stats(aggregated_domains)

print('updating mappings, it may take a while')

mappings_file = utils.data_location / 'mappings.json'
mappings = {}
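# `utils.aggregate` is assumed to merge the per-source entries that share the same key
# ('url' by default, 'domain' for the domain-level data). The real implementation may
# resolve label conflicts differently; this is only a sketch of that grouping step.
from collections import defaultdict

def aggregate(items, key='url'):
    grouped = defaultdict(lambda: {'labels': set(), 'sources': set()})
    for el in items:
        grouped[el[key]]['labels'].add(el['label'])
        grouped[el[key]]['sources'].add(el['source'])
    # keep a single label only where every source agrees
    return {
        k: {'label': v['labels'].pop(), 'sources': sorted(v['sources'])}
        for k, v in grouped.items() if len(v['labels']) == 1
    }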
    #soup = BeautifulSoup(f, 'html.parser')
    #tree = etree.parse(f, etree.HTMLParser())
    content = f.read()
    #root = tree.getroot()
    #matches = root.findall('a[@tabindex="-1" and target="_blank"]')
    #matches = soup.find_all('a', attrs={'tabindex': '-1', 'target': '_blank'})
    #matches = tree.xpath('a')
    # look for the <a> with tabindex="-1" target="_blank"
    fb_urls = re.findall(r'<a\shref="([^>]*)" tabindex="-1" target="_blank"', content)
    real_urls = [urlparse.parse_qs(urlparse.urlparse(u).query)['u'] for u in fb_urls]
    unique = {u for sublist in real_urls for u in sublist}
    #print(unique)
    if len(unique) != 1:
        print(file_location, unique)
        continue
    id = file_location.split('/')[-1].split('.')[0]
    url = unique.pop()
    label = data[id]['label']
    label_binary = {'mostly true': 'true', 'mostly false': 'fake'}.get(label, None)
    unfiltered.append({'url': url, 'label': label, 'source': 'buzzface'})
    if label_binary:
        results.append({'url': url, 'label': label_binary, 'source': 'buzzface'})

utils.write_json_with_path(unfiltered, folder / 'intermediate', 'unfiltered.json')
utils.write_json_with_path(results, folder, 'urls.json')

by_domain = utils.compute_by_domain(results)
utils.write_json_with_path(by_domain, folder, 'domains.json')