def scrape_extract_store_rss(site_name, db):
    """Scrape the RSS feed of ``site_name`` and store its new items in ``db``.

    Items yielded by ``rss_sites.scrape(site_name)`` are processed in order
    until one is found whose date is not later than the latest date already
    stored for this source; processing stops at that item.
    (Presumably the feed yields newest items first -- confirm with the scraper.)

    Every processed item is classified, tagged with its organization and
    inserted into ``db``; geo features are extracted only for items
    classified as accidents.
    """
    newest_stored = db.get_latest_date_of_source(site_name)

    for item in rss_sites.scrape(site_name):
        if item.date <= newest_stored:
            # Reached an item that is not newer than what is already stored.
            break

        # TODO: pass both title and description, leaving this choice to the classifier
        text_to_classify = item.title or item.description
        item.accident = classify_rss(text_to_classify)
        item.organization = classify_organization(site_name)

        if item.accident:
            # FIX: No accident-accurate date extracted
            extract_geo_features(db, item)

        db.insert_new_newsflash(item)
def test_classification_statistics_ynet():
    """Check the ynet title classifier against the recorded best scores.

    Computes precision, recall and F1 of ``classify_rss`` over the labelled
    titles in ``tests/accidents_definitional_ynet.tsv`` and asserts that each
    one is strictly greater than its ``BEST_*_YNET`` constant.
    """
    # The classification in the file is "definitional", meaning:
    # We don't care if it is "about" an accident, but rather whether it is "THE report".
    # In other words, is it the _first_ report about a _recent_ accident

    # Confusion matrix, indexed as stats[expected][actual].
    stats = {True: {True: 0, False: 0}, False: {True: 0, False: 0}}

    with open('tests/accidents_definitional_ynet.tsv', encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # A trailing newline (or a blank line) would otherwise break
                # the two-field unpacking below.
                continue
            # Each row is "<title>\t<integer label>".
            title, expected = line.split('\t')
            expected = bool(int(expected))
            # Coerce to bool so any truthy/falsy classifier result is a valid key.
            actual = bool(classify_rss(title))
            stats[expected][actual] += 1

    tp = stats[True][True]
    fp = stats[False][True]
    fn = stats[True][False]

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    # These constants should (hopefully) only be updated upwards
    assert precision > BEST_PRECISION_YNET
    assert recall > BEST_RECALL_YNET
    assert f1 > BEST_F1_YNET