def run(url, report, destination=None, download=True):
    """Download IL data, extract, load, and compute some simple stats"""
    logger.info('*** IL Data Analysis Started ***')

    destination = download_and_unzip_data(url, destination)
    csv_path = get_datafile_path(url, destination)
    stops = load_csv(csv_path)
    analyze(stops, report)
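# NOTE: load_csv is defined elsewhere in the project. A minimal, hypothetical
# sketch of what it might look like (assuming pandas and a plain CSV with a
# header row), included only to make the excerpt easier to follow:

import pandas as pd


def load_csv(csv_path):
    # Assumption: the IL extract is a plain CSV with a header row; the real
    # helper may set dtypes, parse dates, or drop columns before analysis.
    return pd.read_csv(csv_path, low_memory=False)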
def run(url, report, destination=None, download=True):
    """Download MD data, extract, load, and compute some simple stats"""
    logger.info('*** MD Data Analysis Started ***')

    destination = download_and_unzip_data(url, destination)
    xls_path = get_datafile_path(url, destination)
    stops = load_xls(xls_path)
    stops = process_raw_data(stops, to_drop=())
    analyze(stops, report)
def run(url, destination=None, download=True):
    """Download MD data, extract, convert to CSV, and scan for issues"""
    logger.info('*** MD Data Scan Started ***')

    destination = download_and_unzip_data(url, destination)

    # Convert to CSV
    xls_path = get_datafile_path(url, destination)
    csv_path = get_csv_path(url, destination)
    if not os.path.exists(csv_path):
        xls_to_csv(xls_path, csv_path)
    else:
        logger.info("{} exists, skipping XLS->CSV conversion".format(csv_path))

    csv_count = line_count(csv_path)
    logger.debug('Rows: {}'.format(csv_count))

    scan([csv_path])
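# NOTE: xls_to_csv and line_count are project helpers not shown in this
# excerpt. Hypothetical sketches (assuming pandas can read the MD workbook
# and that the row count is just a line count of the CSV) might look like:

import pandas as pd


def xls_to_csv(xls_path, csv_path):
    # Assumption: the MD workbook has a single sheet of stop records.
    pd.read_excel(xls_path).to_csv(csv_path, index=False)


def line_count(path):
    # Count lines without loading the whole file into memory.
    with open(path) as f:
        return sum(1 for _ in f)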
def test_download_and_unzip_data(self):
    """
    Create a temporary directory, create a zip inside it, then check that
    download_and_unzip_data fetches and unpacks it into that directory.
    """
    orig_destination = tempfile.TemporaryDirectory()
    zip_path = os.path.join(orig_destination.name, 'foo.zip')
    self.make_test_zip(zip_path)
    url = 'http://example.com/foo.zip'  # must have same basename that we create
    destination = download_and_unzip_data(url, orig_destination.name)
    self.assertEqual(orig_destination.name, destination)
    self.assertEqual(
        {'foo.zip', 'file1.txt', 'file2.txt', 'file3.txt'},
        set(os.listdir(orig_destination.name))
    )
    orig_destination.cleanup()
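# NOTE: the test above exercises download_and_unzip_data, which is defined
# elsewhere. A minimal sketch consistent with the test's expectations (the zip
# is saved under the URL's basename inside destination, extracted in place,
# and destination is returned); the project's real helper may differ in
# details such as logging or streaming the download:

import os
import tempfile
import zipfile

import requests


def download_and_unzip_data(url, destination=None):
    if destination is None:
        # Assumption: callers that pass no destination get a temp directory.
        destination = tempfile.mkdtemp()
    zip_path = os.path.join(destination, os.path.basename(url))
    response = requests.get(url)
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(destination)
    return destination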
def run(url, destination=None, download=True):
    """Download IL data, extract, and load into PostgreSQL"""
    logger.info('*** IL Data Import Started ***')

    destination = download_and_unzip_data(url, destination)

    # Clean up the raw CSV into a processed CSV
    raw_csv_path = get_datafile_path(url, destination)
    processed_csv_path = get_csv_path(url, destination)
    if not os.path.exists(processed_csv_path):
        raw_to_processed(raw_csv_path, processed_csv_path)
    else:
        logger.info("{} exists, skipping cleanup".format(processed_csv_path))

    csv_count = line_count(processed_csv_path)
    logger.debug('Rows: {}'.format(csv_count))

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_il'].cursor())

    # use COPY to load CSV file as quickly as possible
    copy_from(processed_csv_path)

    # Clear the query cache
    flush_memcached()
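# NOTE: drop_constraints_and_indexes is not shown in this excerpt. A
# hypothetical sketch of the idea: drop constraints and indexes before the
# bulk COPY so PostgreSQL does not have to maintain them row by row. The
# table, constraint, and index names below are placeholders, not the
# project's actual schema:

def drop_constraints_and_indexes(cursor):
    # Placeholder DDL; the real helper presumably targets the IL stops tables.
    cursor.execute('ALTER TABLE stops DROP CONSTRAINT IF EXISTS stops_pkey CASCADE')
    cursor.execute('DROP INDEX IF EXISTS stops_agency_id_idx')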
def run(url, destination=None, download=True):
    """Download MD data, extract, convert to CSV, and load into PostgreSQL"""
    logger.info('*** MD Data Import Started ***')

    destination = download_and_unzip_data(url, destination)

    # Convert to CSV
    xls_path = get_datafile_path(url, destination)
    csv_path = get_csv_path(url, destination)
    if not os.path.exists(csv_path):
        xls_to_csv(xls_path, csv_path)
    else:
        logger.info("{} exists, skipping XLS->CSV conversion".format(csv_path))

    csv_count = line_count(csv_path)
    logger.debug('Rows: {}'.format(csv_count))

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_md'].cursor())

    # use COPY to load CSV files as quickly as possible
    copy_from(csv_path)

    # Clear the query cache
    flush_memcached()
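# NOTE: copy_from is not shown in this excerpt. A hypothetical sketch of the
# COPY-based load; the 'stops' table name and CSV options are assumptions,
# and the real loader may build its COPY statement from the CSV header:

from django.db import connections


def copy_from(csv_path):
    with connections['traffic_stops_md'].cursor() as cursor, open(csv_path) as f:
        # COPY streams the whole file server-side, which is far faster than
        # issuing row-by-row INSERTs.
        cursor.copy_expert('COPY stops FROM STDIN WITH CSV HEADER', f)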
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
       either be a URL supported by the requests library OR the special URL
       MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the
       state of North Carolina server.
    :param destination: directory for unpacking zip and creating other files;
       pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big" NC
       agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
       save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
       save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError("destination must be provided when no URL is provided")
    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError("provide neither or both of min_stop_id and max_stop_id")
    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError("min_stop_id cannot be larger than max_stop_id")

    logger.info("*** NC Data Import Started ***")

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing the entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = "Jan 01, 2002"

    # convert data files to CSV for database importing
    logger.info("Converting to CSV")
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    logger.info("Looking for new NC agencies in Stops.csv")
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), "NC_agencies.csv"),
        destination)

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    cache.clear()

    # fix landing page data
    facts = compute_dataset_facts(Agency, Stop, settings.NC_KEY,
                                  Search=Search,
                                  override_start_date=override_start_date)
    logger.info("NC dataset facts: %r", facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
       either be a URL supported by the requests library OR the special URL
       MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the
       state of North Carolina server.
    :param destination: directory for unpacking zip and creating other files;
       pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big" NC
       agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
       save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
       save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError('destination must be provided when no URL is provided')
    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError('provide neither or both of min_stop_id and max_stop_id')
    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError('min_stop_id cannot be larger than max_stop_id')

    logger.info('*** NC Data Import Started ***')

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing the entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = 'Jan 01, 2002'

    # convert data files to CSV for database importing
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), 'NC_agencies.csv'),
        destination
    )

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_nc'].cursor())

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info('NC Data Import Complete')

    # Clear the query cache to get rid of NC queries made on old data
    flush_memcached()

    # fix landing page data
    facts = compute_dataset_facts(
        Agency, Stop, settings.NC_KEY,
        Search=Search,
        override_start_date=override_start_date
    )
    logger.info('NC dataset facts: %r', facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
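# NOTE: flush_memcached is not shown in this excerpt. The earlier NC importer
# performs the same "clear the query cache" step by calling cache.clear()
# directly, so this helper is most likely a thin wrapper around Django's
# cache API; a sketch under that assumption:

from django.core.cache import cache


def flush_memcached():
    # Drop all cached query results so pages are rebuilt from the new data.
    cache.clear()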