def buzz(mute_alerts, msg, username='******', channel='@david', icon=':bee:'):
    if not mute_alerts:
        send_to_slack(msg, username, channel, icon)
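# Example usage (illustrative only): wrap any alert in buzz() so a single
# mute_alerts flag can silence notifications.
#
#     buzz(mute_alerts, "ETL job finished", channel='@david')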
def p_watch(self):
    bars, header = self.projects()
    msg = '\n'.join(bars)
    send_to_slack(msg, username='******', channel='@david',
                  icon=':film_projector:')
def main(schema, **kwparams):
    # Scrape the location of the zip file (and the designation of the election):
    r = requests.get(
        "http://www.alleghenycounty.us/elections/election-results.aspx")
    tree = html.fromstring(r.content)
    #title_kodos = tree.xpath('//div[@class="custom-form-table"]/table/tbody/tr[1]/td[2]/a/@title')[0] # XPath to find the title for the link
    # As the title is human-generated, it can differ from the actual text shown
    # on the web page. In one instance, the title was '2019 Primary', while the
    # link text was '2019 General'.
    election_index = 1  # Manually increment this to re-pull older elections.
    title_kodos = tree.xpath(
        '//div[@class="custom-form-table"]/table/tbody/tr[{}]/td[2]/a/text()'.format(
            election_index))[0]  # XPath to find the text for the link
                                 # to the MOST RECENT election (e.g., "2017 General Election").
    url = tree.xpath(
        '//div[@class="custom-form-table"]/table/tbody/tr[{}]/td[2]/a'.format(
            election_index))[0].attrib['href']
    # But that URL looks like this:
    #     'http://results.enr.clarityelections.com/PA/Allegheny/71801/Web02/#/'
    # so it still doesn't get us the other 6-digit number needed for the full
    # path, leaving us to scrape that too. Such scraping turns out to be
    # necessary anyway, since the directory where the zipped CSV files are
    # found also changes from election to election.

    path = dname + "/tmp"
    # If this path doesn't exist, create it.
    if not os.path.exists(path):
        os.makedirs(path)

    # Worse than that, the page content is generated client-side (via JavaScript),
    # so one must use something like Selenium to find out what the download link is.
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException

    chrome_options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': path}
    chrome_options.add_experimental_option('prefs', prefs)
    chromedriver_path = "/usr/local/bin/chromedriver"
    try:
        chrome_options.add_argument("--headless")  # Enable headless mode to allow the ETL job to
        chrome_options.add_argument("--window-size=1920x1080")  # run when the screen is locked.
        driver = webdriver.Chrome(chromedriver_path,
                                  chrome_options=chrome_options)
    except:
        driver = webdriver.Chrome("/Users/drw/Apps/Internet/chromedriver",
                                  chrome_options=chrome_options)
        # This is just a different location to check for chromedriver. The path
        # could be moved to a local preferences file.

    driver.get(url)
    # At this point, it's not possible to get the link immediately, since
    # the page is generated and loaded too slowly.
    # "the webdriver will wait for a page to load by default. It does
    # not wait for loading inside frames or for ajax requests. It means
    # when you use .get('url'), your browser will wait until the page
    # is completely loaded and then go to the next command in the code.
    # But when you are posting an ajax request, webdriver does not wait
    # and it's your responsibility to wait an appropriate amount of time
    # for the page or a part of page to load; so there is a module named
    # expected_conditions."
    # (A hedged sketch of that explicit-wait approach appears after this function.)
    delay = 15  # seconds
    time.sleep(delay)

    download_class = "pl-2"
    download_entities = fetch_download_entities(driver, download_class)
    if len(download_entities) == 0:
        # Fall back to the older download_class (2019 Primary election and earlier
        # [yes, the HTML can change from election to election]).
        download_class = "list-download-link"
        download_entities = fetch_download_entities(driver, download_class)
        if len(download_entities) == 0:
            send_to_slack(
                "countermeasures can no longer find the part of the DOM that contains the download links.",
                username='******',
                channel='@david',
                icon=':satellite_antenna:')
            driver.quit()
            raise RuntimeError(
                "Screen-scraping error. Nothing found in class {}.".format(
                    download_class))

    summary_file_url = download_entities[0].get_attribute("href")

    # Download the ZIP file.
    #r = requests.get("http://results.enr.clarityelections.com/PA/Allegheny/63905/188108/reports/summary.zip") # 2016 General Election file URL
    #election_type = "Primary"
    #r = requests.get("http://results.enr.clarityelections.com/PA/Allegheny/68994/188052/reports/summary.zip") # 2017 Primary Election file URL
    election_type = "General"
    #path_for_current_results = "http://results.enr.clarityelections.com/PA/Allegheny/71801/189912/reports/"
    #summary_file_url = path_for_current_results + "summary.zip"
    r = requests.get(summary_file_url)  # 2017 General Election file URL
    # For now, this is hard-coded.
    #xml_file_url = path_for_current_results + "detailxml.zip"
    xml_index = 2  # Previously this was 3.
    #xml_file_url = driver.find_elements_by_class_name(download_class)[xml_index].get_attribute("href")
    xml_file_url = download_entities[xml_index].get_attribute("href")
    found = True
    if re.search("xml", xml_file_url) is None:
        xml_index = 1
        found = False
    #list_download_links = driver.find_elements_by_class_name(download_class)
    while xml_index < len(download_entities) and not found:
        xml_file_url = download_entities[xml_index].get_attribute("href")
        found = re.search("xml", xml_file_url) is not None
        xml_index += 1
    driver.quit()
    print("xml_file_url = {}".format(xml_file_url))
    if not found:
        notify_admins(
            "Scraping Failure: Unable to find an XML file. Countermeasures terminated."
        )
        raise ValueError(
            "This ETL job is broken on account of scraping failure.")

    # Save the result from requests to the zip_file location.
    zip_file = dname + '/tmp/summary.zip'
    with open(zip_file, 'wb') as f:
        f.write(r.content)
    print("zip_file = {}".format(zip_file))

    today = datetime.now()
    # Make the name of the hash database dependent on the server,
    # as a very clear way of differentiating test and production datasets.
    server = kwparams.get('server', "test")
    db = dataset.connect('sqlite:///{}/hashes-{}.db'.format(dname, server))
    table = db['election']

    #with open(os.path.dirname(os.path.abspath(__file__))+'/ckan_settings.json') as f: # The path of this file needs to be specified.
    with open(ELECTION_RESULTS_SETTINGS_FILE) as f:
        settings = json.load(f)
    site = settings['loader'][server]['ckan_root_url']
    package_id = settings['loader'][server]['package_id']
    API_key = settings['loader'][server]['ckan_api_key']

    changed, last_hash_entry, last_modified = is_changed(
        table, zip_file, title_kodos)
    if not changed:
        print("The Election Results summary file for {} seems to be unchanged.".format(
            title_kodos))
        return
    else:
        print("The Election Results summary file for {} does not match a previous file.".format(
            title_kodos))

    election_type = None
    # Change this to force a particular election_type to be used, but it's
    # basically irrelevant since r_name_kang is not being used.
    r_name_kang = build_resource_name(today, last_modified, election_type)
    #r_name_kodos = re.sub(" Results", " Election Results", title_kodos)
    # Sample names from titles of links:
    #     Special Election for 35th Legislative District
    #     2017 General Results
    #     Election Results: 2014 Primary
    #     Election Results: 2014 General Election
    #     2012 Special 40th State Sen Results
    # Since there's so much variation in these names, maybe it's best just
    # to use them without modifying them and accept that the resource
    # names will vary a little. They can always be cleaned up after the election.
    r_name_kodos = title_kodos
    print("Inferred name = {}, while scraped name = {}".format(
        r_name_kang, r_name_kodos))
    r_chosen_name = r_name_kodos  # Using the scraped name seems better.
    if r_name_kang != r_name_kodos:
        resource_id = find_resource_id(site, package_id, r_chosen_name,
                                       API_key=API_key)
        if resource_id is None:
            send_to_slack(
                "countermeasures has found two conflicting names for the resource: {} and {}. Neither can be found in the dataset. {} is being used as the default.\nThis is your reminder to move the new resources to the top of the list."
                .format(r_name_kodos, r_name_kang, r_name_kodos),
                username='******',
                channel='@david',
                icon=':satellite_antenna:')
            # The first time this notification fired, the Kodos name was
            # "Special Election for 35th Legislative District" and the Kang name
            # was "2018 General Election Results". The second name was
            # (incorrectly) used for storing the CSV file, while the first name
            # was used for storing the zipped XML file.

    # Unzip the file.
    filename = "summary.csv"
    zf = PyZipFile(zip_file).extract(filename, path=path)
    target = "{}/{}".format(path, filename)
    print("target = {}".format(target))

    specify_resource_by_name = True
    if specify_resource_by_name:
        kwargs = {'resource_name': r_chosen_name}
    #else:
    #    kwargs = {'resource_id': ''}

    # Code below stolen from prime_ckan/*/open_a_channel(), but really
    # from utility_belt/gadgets.
    print("Preparing to pipe data from {} to resource {} (package ID = {}) on {}".format(
        target, list(kwargs.values())[0], package_id, site))
    time.sleep(1.0)

    pipeline = pl.Pipeline('election_results_pipeline',
                           'Pipeline for the County Election Results',
                           log_status=False,
                           settings_file=ELECTION_RESULTS_SETTINGS_FILE,
                           settings_from_file=True,
                           start_from_chunk=0
                           ) \
        .connect(pl.FileConnector, target, encoding='utf-8') \
        .extract(pl.CSVExtractor, firstline_headers=True) \
        .schema(schema) \
        .load(pl.CKANDatastoreLoader, server,
              fields=fields_to_publish,
              #package_id=package_id,
              #resource_id=resource_id,
              #resource_name=resource_name,
              key_fields=['line_number'],
              method='upsert',
              **kwargs).run()

    update_hash(db, table, zip_file, r_chosen_name, last_modified)

    # Also update the zipped XML file.
    r_xml = requests.get(xml_file_url)
    xml_file = dname + '/tmp/detailxml.zip'
    with open(xml_file, 'wb') as g:
        g.write(r_xml.content)

    xml_name = r_chosen_name + ' by Precinct (zipped XML file)'
    ckan = RemoteCKAN(site, apikey=API_key)
    resource_id = find_resource_id(site, package_id, xml_name, API_key=API_key)
    if resource_id is None:
        ckan.action.resource_create(
            package_id=package_id,
            url='dummy-value',  # ignored but required by CKAN < 2.6
            name=xml_name,
            upload=open(xml_file, 'rb'))
    else:
        ckan.action.resource_update(
            package_id=package_id,
            url='dummy-value',  # ignored but required by CKAN < 2.6
            id=resource_id,
            upload=open(xml_file, 'rb'))

    log = open(dname + '/uploaded.log', 'w+')
    if specify_resource_by_name:
        print("Piped data to {}".format(kwargs['resource_name']))
        log.write("Finished upserting {}\n".format(kwargs['resource_name']))
    else:
        print("Piped data to {}".format(kwargs['resource_id']))
        log.write("Finished upserting {}\n".format(kwargs['resource_id']))
    log.close()

    # Delete the temporary files after extraction.
    delete_temporary_file(zip_file)
    delete_temporary_file(path + '/' + filename)
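# The quoted note in main() above points out that Selenium's driver.get() does
# not wait for AJAX-loaded content, which is why a fixed time.sleep(delay) is
# used there. Below is a minimal sketch of the explicit-wait alternative using
# expected_conditions. wait_for_download_entities is a hypothetical helper, not
# something this ETL job currently calls, and the 15-second timeout is an
# assumption.
def wait_for_download_entities(driver, download_class, timeout=15):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    # Block until at least one element with the given class name is present, or
    # raise selenium.common.exceptions.TimeoutException after `timeout` seconds.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, download_class)))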
def notify_admins(msg):
    print(msg)
    send_to_slack(msg, username='******', channel='#other-notifications',
                  icon=':satellite_antenna:')
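# send_to_slack() is imported from elsewhere in this repository; its actual
# implementation is not shown here. For context, a minimal sketch of a sender
# built on a Slack incoming webhook might look like the following. The
# webhook_url parameter is an assumed configuration value (not part of this
# codebase), and newer webhook configurations may ignore the channel/username/
# icon_emoji overrides. This sketch relies on the module-level `requests`
# import used elsewhere in this file.
def _send_to_slack_sketch(msg, webhook_url, username='******',
                          channel='#other-notifications',
                          icon=':satellite_antenna:'):
    payload = {'text': msg, 'username': username,
               'channel': channel, 'icon_emoji': icon}
    requests.post(webhook_url, json=payload)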
try:
    if len(sys.argv) > 1:
        server = sys.argv[1]
        # When invoking this function from the command line, the argument
        # 'production' must be given to push data to a public repository.
        # Otherwise, it will default to going to a test directory.
        main(schema, server=server)
        # Note that the hash database is currently unaware of which server a
        # file is saved to, so if a file is first saved to the test server and
        # you run the ETL script again for the production server, and the file
        # hasn't changed, the script will not push the data to the production
        # server.
    else:
        main(schema)
except:
    e = sys.exc_info()[0]
    print("Error: {} : ".format(e))
    exc_type, exc_value, exc_traceback = sys.exc_info()
    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
    traceback_msg = ''.join('!! ' + line for line in lines)
    print(traceback_msg)
    # Log it or whatever here.
    msg = "countermeasures ran into an error: {}.\nHere's the traceback:\n{}".format(
        e, traceback_msg)
    mute_alerts = False  #kwargs.get('mute_alerts', False)
    if not mute_alerts:
        send_to_slack(msg, username='******', channel='@david',
                      icon=':satellite_antenna:')
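# Example invocations (the script name "countermeasures" is inferred from the
# Slack messages above and may differ):
#
#     python countermeasures.py              # default: load to the test server
#     python countermeasures.py production   # push data to the public repository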
        # https://github.com/WPRDC/data-guide/blob/master/docs/metadata_extras.md
        # The format is like this:
        #     u'extras': [{u'key': u'dcat_issued', u'value': u'2014-01-07T15:27:45.000Z'}, ...
        # that is, not a dict, but a list of dicts.
        extras = {d['key']: d['value'] for d in extras_list}
        #if 'dcat_issued' not in extras:
        if 'time_field' in extras:
            time_field_lookup = json.loads(extras['time_field'])
            fix_temporal_coverage(package['id'], time_field_lookup, just_testing)

from credentials import production

try:
    if __name__ == '__main__':
        just_testing = False
        if len(sys.argv) > 1:
            if sys.argv[1] == 'True':
                just_testing = True
            elif sys.argv[1] == 'False':
                just_testing = False
        main(just_testing=just_testing)
except:
    e = sys.exc_info()[0]
    msg = "Error: {} : \n".format(e)
    exc_type, exc_value, exc_traceback = sys.exc_info()
    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
    msg = ''.join('!! ' + line for line in lines)
    msg = "watchdog.py failed for some reason.\n" + msg
    print(msg)
    # Log it or whatever here.
    if not just_testing and production:
        send_to_slack(msg, username='******', channel='@david', icon=':doge:')
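# Example invocations (watchdog.py, per the argument handling above):
#
#     python watchdog.py True    # just_testing mode (suppresses the failure alert)
#     python watchdog.py False   # normal mode; the alert fires if `production` is set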
def main(mute_alerts=True,
         check_private_datasets=False,
         skip_watchdog=False,
         test_mode=False):
    if not skip_watchdog:
        watchdog.main(just_testing=False)

    if False:  # [ ] The code in this branch can be eliminated.
        host = "data.wprdc.org"
        url = "https://{}/api/3/action/current_package_list_with_resources?limit=999999".format(
            host)
        r = requests.get(url)
        response = r.json()
        if not response['success']:
            msg = "Unable to get the package list."
            print(msg)
            raise ValueError(msg)
        packages = response['result']
    else:
        from credentials import site, ckan_api_key as API_key
        if not check_private_datasets:
            API_key = None
        ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
        try:
            packages = ckan.action.current_package_list_with_resources(
                limit=999999)
        except:
            # Retry once if the first request fails.
            packages = ckan.action.current_package_list_with_resources(
                limit=999999)

    period = {
        'Annually': timedelta(days=366),
        'Bi-Annually': timedelta(days=183),
        'Quarterly': timedelta(days=31 + 30 + 31),
        'Bi-Monthly': timedelta(days=31 + 30),
        'Monthly': timedelta(days=31),
        'Bi-Weekly': timedelta(days=14),
        'Weekly': timedelta(days=7),  # 'Weekdays' could be another period, though it seems
                                      # I'm coding exceptions into the no_updates_on metadata field.
        'Daily': timedelta(days=1),
        'Hourly': timedelta(hours=1),
        'Multiple Times per Hour': timedelta(minutes=30)
    }

    # Some datasets are showing up as stale for one day because (for instance)
    # the County doesn't post jail census data on a given day to their FTP
    # server; our ETL script runs, but it doesn't update metadata_modified.
    # One better solution to this would be to create a package-
    # (and maybe also resource-) level metadata field called etl_job_last_ran.
    # [ ] These hard-coded exceptions can now be moved to package-level metadata.
    extensions = {}
    extensions['d15ca172-66df-4508-8562-5ec54498cfd4'] = {
        'title': 'Allegheny County Jail Daily Census',
        'extra_time': timedelta(days=1),
        'actual_data_source_reserve': timedelta(days=15)
    }
    extensions['046e5b6a-0f90-4f8e-8c16-14057fd8872e'] = {
        'title': 'Police Incident Blotter (30 Day)',
        'extra_time': timedelta(days=1)
    }

    nonperiods = ['', 'As Needed', 'Not Updated (Historical Only)']
    packages_with_frequencies = 0
    stale_count = 0
    stale_packages = {}
    for i, package in enumerate(packages):
        if 'frequency_publishing' in package.keys():
            title = package['title']
            package_id = package['id']
            dataset_url = "https://data.wprdc.org/dataset/{}".format(
                package['name'])
            metadata_modified = datetime.strptime(package['metadata_modified'],
                                                  "%Y-%m-%dT%H:%M:%S.%f")
            publishing_frequency = package['frequency_publishing']
            data_change_rate = package['frequency_data_change']
            publisher = package['organization']['title']
            private = package['private']
            if private:
                title = "(private) " + title
            # Check for 'time_field' and the auto-updated temporal_coverage field.
            temporal_coverage_end_date = temporal_coverage_end(package)

            if publishing_frequency in period:
                publishing_period = period[publishing_frequency]
            else:
                publishing_period = None
                if publishing_frequency not in nonperiods:
                    raise ValueError(
                        "{}) {}: {} is not a known publishing frequency".format(
                            i, title, publishing_frequency))
            #print("{} ({}) was last modified {} (according to its metadata). {}".format(title, package_id, metadata_modified, package['frequency_publishing']))
{}".format(title,package_id,metadata_modified,package['frequency_publishing'])) if publishing_period is not None: no_updates_on = get_scheduled_gaps(package) lateness = compute_lateness( extensions, package, package_id, publishing_period, metadata_modified ) # Include no_updates_on here if the ETL jobs # get rescheduled to match actual data updates (rather than state update frequency). if temporal_coverage_end_date is not None: temporal_coverage_end_dt = datetime.strptime( temporal_coverage_end_date, "%Y-%m-%d") + timedelta( days=1 ) # [ ] This has no time zone associated with it. # [ ] Change this to use parser.parse to allow times to be included, but also think more carefully about adding that offset. # Note that temporal_coverage_end_dt is advanced by one one day (to be the first day after the temporal coverage) and # also is technically a datetime but is actually just date information, with the time information thrown out. data_lateness = compute_lateness(extensions, package, package_id, publishing_period, temporal_coverage_end_dt, no_updates_on) else: data_lateness = timedelta(seconds=0) if lateness.total_seconds() > 0 or data_lateness.total_seconds( ) > 0: # Either kind of lateness triggers the listing of another stale package. stale_packages[package_id] = { 'publishing_frequency': publishing_frequency, 'data_change_rate': data_change_rate, 'publisher': publisher, 'json_index': i, 'title': title, 'package_id': package_id, 'package_url': dataset_url, 'upload_method': infer_upload_method(package), 'url': dataset_url } if lateness.total_seconds() > 0: stale_packages[package_id][ 'cycles_late'] = lateness.total_seconds( ) / publishing_period.total_seconds() stale_packages[package_id][ 'last_modified'] = metadata_modified stale_packages[package_id][ 'days_late'] = lateness.total_seconds() / (60.0 * 60 * 24) else: stale_packages[package_id]['cycles_late'] = 0 stale_packages[package_id][ 'last_modified'] = metadata_modified stale_packages[package_id]['days_late'] = 0.0 #if temporal_coverage_end_date is not None: if data_lateness.total_seconds() > 0: stale_packages[package_id][ 'temporal_coverage_end'] = temporal_coverage_end_date # This is a string. stale_packages[package_id][ 'data_cycles_late'] = data_lateness.total_seconds( ) / publishing_period.total_seconds() # Describe the evidence that the package is stale. output = "{}) {} updates {}".format( i, title, package['frequency_publishing']) if lateness.total_seconds( ) > 0 and data_lateness.total_seconds() > 0: output += " but metadata_modified = {} and temporal_coverage_end_date = {} making it DOUBLE STALE!".format( metadata_modified, temporal_coverage_end_date) elif lateness.total_seconds() > 0: output += " but metadata_modified = {} making it STALE!".format( metadata_modified) elif data_lateness.total_seconds() > 0: output += " but temporal_coverage_end_date = {} making it STALE!".format( temporal_coverage_end_date) stale_packages[package_id]['output'] = output stale_count += 1 packages_with_frequencies += 1 # Sort stale packages by relative tardiness so the most recently tardy ones # appear at the bottom of the output and the most egregiously late ones # at the top. 
    #stale_ps_sorted = sorted(stale_packages.iteritems(), key=lambda(k,v): -v['cycles_late'])
    # Note that in Python 3, key=lambda(k,v): v['position'] must be written as
    # key=lambda k_v: k_v[1]['position'].
    stale_ps_sorted = sorted(stale_packages.items(),
                             key=lambda k_v: -k_v[1]['cycles_late'])
    print("\nDatasets by Staleness: ")
    print_table(stale_ps_sorted)

    stale_ps_by_recency = sorted(stale_packages.items(),
                                 key=lambda k_v: -k_v[1]['days_late'])
    print("\n\nStale Datasets by Lateness: ")
    print_table(stale_ps_by_recency)

    stale_ps_by_data_lateness = {
        p_id: sp
        for p_id, sp in stale_packages.items() if 'temporal_coverage_end' in sp
    }
    stale_ps_by_data_lateness = sorted(
        stale_ps_by_data_lateness.items(),
        key=lambda k_v: -k_v[1]['data_cycles_late'])
    if len(stale_ps_by_data_lateness) > 0:
        print("\n\nStale Datasets by Data-Lateness: ")
        print_table(stale_ps_by_data_lateness, 'data-lateness')
    else:
        print("No datasets are stale by data-lateness.")

    coda = ("Out of {} packages, only {} have specified publication frequencies. "
            "{} are stale (past their refresh-by date), according to the "
            "metadata_modified field.").format(len(packages),
                                               packages_with_frequencies,
                                               stale_count)
    print(textwrap.fill(coda, 70))

    # Store the list of stale packages in a JSON file as a record of the last
    # glance (with the intent of sending notifications whenever new ones show up).
    currently_stale = []
    previously_stale = load_from_json()
    previously_stale_ids = [x['id'] for x in previously_stale]
    newly_stale = []
    for sp in stale_ps_by_recency:
        r = {'id': sp[0], 'title': sp[1]['title']}
        currently_stale.append(r)
        if sp[0] not in previously_stale_ids:
            newly_stale.append(sp)

    wprdc_datasets = [
        '22fe57da-f5b8-4c52-90ea-b10591a66f90',  # Liens
        'f2141a79-c0b9-4cf9-b4d2-d591b4aaa8e6'   # Foreclosures
    ]  # These are WPRDC-maintained datasets.

    if len(newly_stale) > 0:
        printable_stale_items = [
            "{} ({})".format(sp[1]['title'], sp[1]['package_url'])
            for sp in newly_stale
        ]
        linked_stale_items = [
            "<{}|{}> ({})".format(sp[1]['package_url'], sp[1]['title'],
                                  sp[1]['upload_method']) for sp in newly_stale
        ]
        includes_etl_string = " (includes ETL job)" if any(
            [sp[1]['upload_method'] == 'etl' for sp in newly_stale]) else ""
        msg = "NEWLY STALE{}: {}".format(
            includes_etl_string, ', '.join(linked_stale_items))  # formatted for Slack
        printable_msg = "NEWLY STALE{}: {}".format(
            includes_etl_string, ', '.join(printable_stale_items))
        print(printable_msg)
        if not mute_alerts:
            send_to_slack(msg,
                          username='******',
                          channel='#stale-datasets',
                          icon=':illuminati:')
            other_notifications = [{
                'publisher': 'Allegheny County',
                'medium': 'Slack',
                'channel': '#county-stale-datasets',
                'slack_group': 'wprdc-and-friends',
                'slack-config': 'something'
            }]
            for other in other_notifications:
                if other['publisher'] in [sp[1]['publisher'] for sp in newly_stale]:
                    publisher_stale_sets = []
                    for sp in newly_stale:
                        if (other['publisher'] == sp[1]['publisher']
                                and sp[0] not in wprdc_datasets):
                            publisher_stale_sets.append(sp)
                    publisher_stale_ones = [
                        "<{}|{}>".format(sp[1]['url'], sp[1]['title'])
                        for sp in publisher_stale_sets
                    ]
                    if len(publisher_stale_ones) > 0:
                        printable_publisher_stale_ones = [
                            sp[1]['title'] for sp in publisher_stale_sets
                        ]
                        multiple = len(publisher_stale_ones) != 1
                        publisher_msg = "Hey there! I just noticed {} newly stale {}: {}".format(
                            len(publisher_stale_ones),
                            pluralize("dataset", publisher_stale_ones, False),
                            ', '.join(publisher_stale_ones))
                        #send_to_different_slack: wprdc-and-friends
                        print(publisher_msg)
                        send_to_slack(publisher_msg,
                                      username='******',
                                      channel='#county-stale-datasets',
                                      slack_group=other['slack_group'])
                        #send_to_slack(publisher_msg, username='******', channel='#boring-tests', slack_group=other['slack_group'])
        else:
            print("[Slack alerts are muted.]")

    store_as_json(currently_stale)
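# compute_lateness() is defined elsewhere in this repository. Based on how its
# result is used in main() above (lateness.total_seconds() > 0, and
# cycles_late = lateness / publishing_period), a rough, hypothetical sketch of
# the kind of check it presumably performs is given below. This is a simplified
# stand-in: the real function also takes the package and a no_updates_on
# schedule, so its details may well differ.
def _sketch_compute_lateness(extensions, package_id, publishing_period,
                             last_modified):
    # A dataset counts as late once more than one publishing period (plus any
    # hard-coded grace period from `extensions`) has elapsed since last_modified.
    # Returns a timedelta; zero or negative means "not late yet".
    grace = extensions.get(package_id, {}).get('extra_time', timedelta(0))
    return datetime.now() - (last_modified + publishing_period + grace)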
        elif arg in ['test']:
            test_mode = True
            args.remove(arg)
        elif arg in ['production']:
            test_mode = False
            args.remove(arg)
        elif arg in ['private']:
            check_private_datasets = True
            args.remove(arg)
        elif arg in ['skip', 'snooze']:
            skip_watchdog = True
            args.remove(arg)

    if len(args) > 0:
        print("Unused command-line arguments: {}".format(args))

    main(mute_alerts, check_private_datasets, skip_watchdog, test_mode)
except:
    e = sys.exc_info()[0]
    msg = "Error: {} : \n".format(e)
    exc_type, exc_value, exc_traceback = sys.exc_info()
    lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
    msg = ''.join('!! ' + line for line in lines)
    msg = "pocket_watch/glance.py failed for some reason.\n" + msg
    print(msg)
    # Log it or whatever here.
    if production:
        send_to_slack(msg, username='******', channel='@david',
                      icon=':illuminati:')
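# Example invocations (pocket_watch/glance.py, per the error message above); the
# argument branch that sets mute_alerts lies outside this excerpt:
#
#     python glance.py production private   # include private datasets in the check
#     python glance.py test snooze          # test mode, skipping the watchdog run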