def set_options(self, *args, **kwargs):
    """
    Configure the paths and prompt text used by the rest of the command.

    Reads the ``test_data``, ``download`` and ``verbosity`` entries of
    ``kwargs``. Creates the data and csv directories when they are missing
    and, when a download was requested, renders the confirmation prompt
    into ``self.prompt``.
    """
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'

    if kwargs['test_data']:
        self.data_dir = get_test_download_directory()
        # Point the app-wide setting at the test data directory as well
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    # makedirs (not mkdir) so a download directory whose parent
    # directories do not exist yet can still be created
    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()
        prompt_context = dict(
            last_updated=self.download_metadata['last-modified'],
            time_ago=naturaltime(self.download_metadata['last-modified']),
            size=size(self.download_metadata['content-length']),
            last_download=self.local_metadata['last-download'],
            download_dir=self.data_dir,
        )
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )

    self.verbosity = int(kwargs['verbosity'])
def handle(self, *args, **options):
    """
    Process the sampled test archive found in the test download directory.

    Reads the release datetime and size recorded in ``sampled_version.txt``,
    gets or creates the matching ``RawDataVersion``, then unzips, preps and
    tracks the files. The archive step only runs when the
    ``CALACCESS_STORE_ARCHIVE`` setting is truthy.
    """
    self.verbosity = options.get("verbosity")
    self.no_color = options.get("no_color")
    self.raw_data_files = RawDataFile.objects

    self.data_dir = get_test_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    # Name the local zip after the last segment of the source URL
    self.zip_path = os.path.join(self.data_dir, self.url.split('/')[-1])

    version_path = os.path.join(self.data_dir, "sampled_version.txt")
    with open(version_path, "r") as f:
        # readline() keeps the line terminator; strip it so the values
        # handed to the ORM are exactly what was recorded.
        release_datetime = f.readline().strip()
        # Named expected_size so the local does not shadow a size() helper
        expected_size = f.readline().strip()

    try:
        self.version = RawDataVersion.objects.get(
            release_datetime=release_datetime
        )
    except RawDataVersion.DoesNotExist:
        self.version = RawDataVersion.objects.create(
            release_datetime=release_datetime,
            size=expected_size
        )

    self.unzip()
    self.prep()
    self.track_files()

    if getattr(settings, 'CALACCESS_STORE_ARCHIVE', False):
        self.archive()
def set_options(self, *args, **kwargs):
    """
    Store the source URL, verbosity and working paths on the command.

    Uses the test download directory when ``kwargs['test_data']`` is truthy
    and the regular one otherwise, creating the data and csv directories if
    they are absent. When ``kwargs['download']`` is truthy the download
    prompt is rendered into ``self.prompt``.
    """
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])

    if not kwargs['test_data']:
        self.data_dir = get_download_directory()
    else:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        if self.verbosity:
            self.log("Using test data")

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.csv_dir = os.path.join(self.data_dir, "csv/")

    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if not kwargs['download']:
        return

    # Gather what the prompt template needs about the remote and local copies
    self.download_metadata = self.get_download_metadata()
    self.local_metadata = self.get_local_metadata()
    remote = self.download_metadata
    prompt_context = {
        'last_updated': remote['last-modified'],
        'time_ago': naturaltime(remote['last-modified']),
        'size': size(remote['content-length']),
        'last_download': self.local_metadata['last-download'],
        'download_dir': self.data_dir,
    }
    self.prompt = render_to_string(
        'calaccess_raw/downloadcalaccessrawdata.txt',
        prompt_context,
    )
def set_config(self, *args, **options):
    """
    Record the source and sample locations plus the sampling options.

    ``options['samplerows']`` and ``options['verbosity']`` are coerced to
    ``int``; ``self.tsv_list`` holds the names found in the source tsv
    directory.
    """
    source_root = get_download_directory()
    sample_root = get_test_download_directory()

    self.data_dir = source_root
    self.test_data_dir = sample_root
    self.tsv_dir = os.path.join(source_root, "tsv/")
    self.sample_dir = os.path.join(sample_root, "tsv/")

    self.sample_rows = int(options['samplerows'])
    self.tsv_list = os.listdir(self.tsv_dir)
    self.verbosity = int(options['verbosity'])
def handle(self, *args, **options):
    """
    Download, clean and load the raw data according to ``options``.

    With ``options['test_data']`` set, the download step is switched off,
    files are always kept and the test download directory is used.

    Raises CommandError when test data is requested but its tsv directory
    does not exist.
    """
    super(Command, self).handle(*args, **options)

    # Attributes that several methods of the command rely on
    self.app_name = options["app_name"]
    self.database = options["database"]
    self.keep_files = options["keep_files"]

    using_test_data = options['test_data']
    if using_test_data:
        # Test data is already on disk: nothing to download, nothing to
        # delete afterwards.
        options["download"] = False
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # Set app-wide because cleancalaccessrawfile also calls
        # get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Fail fast when the test data has not been generated
    if using_test_data:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if options['download']:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            resume=options['resume'],
            noinput=options['noinput']
        )

    # Remaining steps, unless they were skipped
    if options['clean']:
        self.clean()
    if options['load']:
        self.load()

    if self.verbosity:
        self.success("Done!")
def handle(self, *args, **options):
    """
    Write a sample of every source tsv file into the test data directory.

    Each file in the source tsv directory is sampled down to
    ``options['samplerows']`` rows and written, header first, under the
    same name in the sample directory. The samples are then zipped and the
    release datetime and size of the most recent finished download are
    written to ``sampled_version.txt``.
    """
    super(Command, self).handle(*args, **options)

    # Options and locations
    self.data_dir = get_download_directory()
    self.test_data_dir = get_test_download_directory()
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")
    self.sample_dir = os.path.join(self.test_data_dir, "tsv/")
    self.sample_rows = int(options['samplerows'])
    self.tsv_list = os.listdir(self.tsv_dir)
    self.verbosity = int(options['verbosity'])

    file_count = len(self.tsv_list)
    self.header("Sampling %i rows from %s source files" % (
        self.sample_rows,
        file_count,
    ))

    # Start from an existing test directory and an empty sample directory
    if not os.path.exists(self.test_data_dir):
        os.makedirs(self.test_data_dir)
    if os.path.exists(self.sample_dir):
        shutil.rmtree(self.sample_dir)
    os.makedirs(self.sample_dir)

    for name in progress.bar(self.tsv_list):
        source_path = os.path.join(self.tsv_dir, name)
        target_path = os.path.join(self.sample_dir, name)

        if self.verbosity > 2:
            self.log(" Sampling %s" % source_path)

        reader = FileInput(source_path, True)
        sampled_rows = two_pass_sample(reader, sample_size=self.sample_rows)

        # Header first, then the sampled rows
        with open(target_path, 'wb') as out:
            out.writelines(chain(reader.header, sampled_rows))

    self.header("Compressing zip file...")
    self.save_zip()

    # Stash the release_datetime and size of the last completed download
    finished_downloads = self.command_logs.filter(
        command='downloadcalaccessrawdata',
        finish_datetime__isnull=False
    ).order_by('-start_datetime')
    version = finished_downloads[0].version

    with open(self.test_data_dir + '/sampled_version.txt', 'w') as f:
        f.write(str(version.release_datetime) + '\n')
        f.write(str(version.size))
def handle(self, *args, **options):
    """
    Run the requested steps: download, unzip, prep, clear, clean, load.

    With ``options['test_data']`` set, the download, unzip, prep and clear
    steps are disabled and the command stops early if the sampled tsv
    directory is missing. Unless ``options['noinput']`` is set, a download
    only proceeds after the user answers ``yes`` to the prompt.
    """
    if options['test_data']:
        # Disable the steps that don't apply to test data
        options["download"] = False
        options["unzip"] = False
        options["prep"] = False
        options["clear"] = False
        self.log("Using test data")

        tsv_dir = os.path.join(get_test_download_directory(), "tsv/")
        # If the directory doesn't exist, abort
        if not os.path.exists(tsv_dir):
            # Adjacent literals rather than a backslash continuation, which
            # would embed the source indentation in the message.
            self.failure(
                "Sampled data tsv directory does not "
                "exist at %s" % tsv_dir
            )
            return

    # Set the options
    self.set_options(*args, **options)

    # Get to work
    if options['download']:
        if not options['noinput']:
            # Ensure stdout can handle Unicode data: http://bit.ly/1C3l4eV
            locale_encoding = locale.getpreferredencoding()
            old_stdout = sys.stdout
            sys.stdout = codecs.getwriter(locale_encoding)(sys.stdout)
            try:
                confirm = input(self.prompt)
            finally:
                # Put stdout back even when reading the answer fails
                # (EOF, interrupt), so later output is not double-wrapped.
                sys.stdout = old_stdout

            if confirm != 'yes':
                self.failure("Download cancelled")
                return
        self.download()

    if options['unzip']:
        self.unzip()
    if options['prep']:
        self.prep()
    if options['clear']:
        self.clear()
    if options['clean']:
        self.clean()
    if options['load']:
        self.load()

    self.success("Done!")
def handle(self, *args, **options):
    """
    Update the raw data, resuming an unfinished update where possible.

    Compares the currently published release with the last started update
    to decide whether the data is up to date or the update can be resumed,
    confirms with the user unless ``options['noinput']`` is set, then runs
    the download, clean and load steps that were not skipped. Outside test
    mode the run is recorded on ``self.log_record``.

    Raises CommandError when the test data tsv directory is missing or the
    user declines to proceed.
    """
    super(Command, self).handle(*args, **options)

    # Attributes that several methods of the command rely on
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]
    self.test_mode = options['test_data']
    self.downloading = options['download']
    self.cleaning = options['clean']
    self.loading = options['load']

    if not self.test_mode:
        self.data_dir = get_download_directory()
    else:
        # Test data is already on disk: skip the download, keep the files
        self.downloading = False
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # Set app-wide because cleancalaccessrawfile also calls
        # get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Fail fast when the test data has not been generated
    if self.test_mode:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    download_metadata = self.get_download_metadata()
    current_release_datetime = download_metadata['last-modified']

    last_started_update = self.get_last_log()

    # Most recently started download, if any
    download_logs = self.command_logs.filter(
        command='downloadcalaccessrawdata'
    ).order_by('-start_datetime')
    try:
        last_download = download_logs[0]
    except IndexError:
        last_download = None

    up_to_date = False
    can_resume = False

    if last_started_update:
        last_version = last_started_update.version
        if current_release_datetime == last_version.release_datetime:
            # Last started update targets the current release: either it
            # finished (up to date) or it can be picked up again.
            if last_started_update.finish_datetime:
                up_to_date = True
            else:
                can_resume = True
        elif not last_started_update.finish_datetime:
            # Unfinished update of an older release
            if not self.downloading:
                # resumable as long as the download is skipped
                can_resume = True
            elif (
                last_download and
                last_download.version == last_version and
                last_download.finish_datetime
            ):
                # or when that release's download already completed
                can_resume = True

    if options['noinput']:
        # Without input, resume automatically whenever that is possible
        self.resume_mode = can_resume
    else:
        prompt_context = {
            'current_release_datetime': current_release_datetime,
            'expected_size': size(download_metadata['content-length']),
            'up_to_date': up_to_date,
            'can_resume': can_resume,
        }

        last_finished_update = self.get_last_log(finished=True)
        if last_finished_update:
            loaded_v = last_finished_update.version
            since_loaded = naturaltime(loaded_v.release_datetime)
        else:
            since_loaded = None
        prompt_context['since_loaded_version'] = since_loaded

        prompt = render_to_string(
            'calaccess_raw/updatecalaccessrawdata.txt',
            prompt_context,
        )

        if can_resume:
            if self.confirm_proceed(prompt):
                self.resume_mode = True
            else:
                # Declined to resume: offer a fresh start instead
                self.resume_mode = False
                if not self.confirm_proceed(
                    'Do you want re-start your update?\n'
                ):
                    raise CommandError("Update cancelled")
        else:
            self.resume_mode = False
            if not self.confirm_proceed(prompt):
                raise CommandError("Update cancelled")

    if not self.test_mode:
        if self.resume_mode:
            self.log_record = last_started_update
        else:
            # get or create a version
            # .get_or_create() throws IntegrityError
            try:
                version = self.raw_data_versions.get(
                    release_datetime=current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                version = self.raw_data_versions.create(
                    release_datetime=current_release_datetime,
                    size=download_metadata['content-length']
                )
            # create a new log record
            self.log_record = self.command_logs.create(
                version=version,
                command=self,
                called_by=self.get_caller_log()
            )

    # True when the user could have resumed but chose not to
    force_restart_download = can_resume and not self.resume_mode

    # Skip the download when the version being updated was the last one
    # downloaded, that download finished, and no restart is being forced.
    if (
        self.downloading and
        last_download and
        not force_restart_download and
        self.log_record.version == last_download.version and
        last_download.finish_datetime
    ):
        self.log('Already downloaded.')
        self.downloading = False

    if self.downloading:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            noinput=True,
            restart=force_restart_download,
        )
        if self.verbosity:
            self.duration()

    # Remaining steps, unless they were skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    if not self.test_mode:
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
def set_options(self, *args, **kwargs):
    """
    Store the options, working paths and download prompt on the command.

    Picks the test or regular download directory from
    ``kwargs['test_data']`` and creates the data and csv directories when
    absent. When ``kwargs['download']`` is truthy it also decides whether
    a partially downloaded zip can be resumed and renders the prompt into
    ``self.prompt``.

    Raises CommandError when test data is requested but its tsv directory
    does not exist.
    """
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])
    self.database = kwargs['database']

    using_test_data = kwargs['test_data']
    if using_test_data:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Fail fast when the test data has not been generated
    if using_test_data:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if not kwargs['download']:
        return

    self.download_metadata = self.get_download_metadata()
    self.local_metadata = self.get_local_metadata()

    total_size = self.download_metadata['content-length']
    last_modified = self.download_metadata['last-modified']
    last_download = self.local_metadata['last-download']
    cur_size = 0

    self.resume_download = (
        kwargs['resume-download'] and os.path.exists(self.zip_path)
    )
    if self.resume_download:
        # Only resume when the partial file was written after the remote
        # data was last modified.
        chunk_datetime = datetime.fromtimestamp(
            os.path.getmtime(self.zip_path),
            utc
        )
        self.resume_download = chunk_datetime > last_modified
        if self.resume_download:
            last_download = chunk_datetime
            cur_size = os.path.getsize(self.zip_path)

    prompt_context = {
        'resuming': self.resume_download,
        'already_downloaded': last_modified == last_download,
        'last_modified': last_modified,
        'last_download': last_download,
        'time_ago': naturaltime(last_download),
        'total_size': size(total_size),
        'cur_size': size(cur_size),
        'download_dir': self.data_dir,
    }
    self.prompt = render_to_string(
        'calaccess_raw/downloadcalaccessrawdata.txt',
        prompt_context,
    )
def set_options(self, *args, **kwargs):
    """
    Store the verbosity, working paths and download prompt on the command.

    The test download directory is used when ``kwargs['test_data']`` is
    truthy, the regular one otherwise; the data and csv directories are
    created when absent. When ``kwargs['download']`` is truthy, the method
    checks whether an existing partial zip can be resumed and renders the
    prompt into ``self.prompt``.

    Raises CommandError when test data is requested but its tsv directory
    does not exist.
    """
    self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
    self.verbosity = int(kwargs['verbosity'])

    test_data = kwargs['test_data']
    if not test_data:
        self.data_dir = get_download_directory()
    else:
        self.data_dir = get_test_download_directory()
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Stop right away if the test data has not been generated
    if test_data:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if kwargs['download']:
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()

        total_size = self.download_metadata['content-length']
        last_modified = self.download_metadata['last-modified']
        last_download = self.local_metadata['last-download']
        cur_size = 0

        self.resume_download = (
            kwargs['resume-download'] and os.path.exists(self.zip_path)
        )
        if self.resume_download:
            # The partial file must be newer than the remote data's last
            # modification for a resume to make sense.
            timestamp = os.path.getmtime(self.zip_path)
            chunk_datetime = datetime.fromtimestamp(timestamp, utc)
            self.resume_download = chunk_datetime > last_modified
            if self.resume_download:
                last_download = chunk_datetime
                cur_size = os.path.getsize(self.zip_path)

        already_downloaded = last_modified == last_download
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            dict(
                resuming=self.resume_download,
                already_downloaded=already_downloaded,
                last_modified=last_modified,
                last_download=last_download,
                time_ago=naturaltime(last_download),
                total_size=size(total_size),
                cur_size=size(cur_size),
                download_dir=self.data_dir,
            ),
        )
def handle(self, *args, **options):
    """
    Update the raw data, resuming an unfinished update where possible.

    In test mode the release datetime and expected size are read from
    ``sampled_version.txt`` and user input is disabled; otherwise they come
    from the download metadata. The method decides whether the data is up
    to date or the update can be resumed, confirms with the user unless
    input is disabled, runs the download, clean and load steps that were
    not skipped, and stamps the finish time on ``self.log_record``.

    Raises CommandError when the test data tsv directory is missing or the
    user declines to proceed.
    """
    super(Command, self).handle(*args, **options)

    # Attributes that several methods of the command rely on
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]
    self.test_mode = options['test_data']
    self.downloading = options['download']
    self.cleaning = options['clean']
    self.loading = options['load']
    self.noinput = options['noinput']

    if not self.test_mode:
        self.data_dir = get_download_directory()
    else:
        # Always keep files and never prompt when running on test data
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # Set app-wide because cleancalaccessrawfile also calls
        # get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        self.noinput = True

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Fail fast when the test data has not been generated
    if self.test_mode:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    if self.test_mode:
        # First line: release datetime, second line: size (raw strings)
        with open(self.data_dir + "/sampled_version.txt", "r") as f:
            current_release_datetime = f.readline()
            expected_size = f.readline()
    else:
        download_metadata = self.get_download_metadata()
        current_release_datetime = download_metadata['last-modified']
        expected_size = download_metadata['content-length']

    last_started_update = self.get_last_log()

    last_download = None
    if not self.test_mode:
        download_logs = self.command_logs.filter(
            command='downloadcalaccessrawdata'
        ).order_by('-start_datetime')
        try:
            last_download = download_logs[0]
        except IndexError:
            last_download = None

    up_to_date = False
    can_resume = False

    if last_started_update:
        last_version = last_started_update.version
        if current_release_datetime == last_version.release_datetime:
            # Last started update targets the current release: either it
            # finished (up to date) or it can be picked up again.
            if last_started_update.finish_datetime:
                up_to_date = True
            else:
                can_resume = True
        elif not last_started_update.finish_datetime:
            # Unfinished update of an older release
            if not self.downloading:
                # resumable as long as the download is skipped
                can_resume = True
            elif (
                last_download and
                last_download.version == last_version and
                last_download.finish_datetime
            ):
                # or when that release's download already completed
                can_resume = True

    if self.noinput:
        # Without input, resume automatically whenever that is possible
        self.resume_mode = can_resume
    else:
        prompt_context = {
            'current_release_datetime': current_release_datetime,
            'expected_size': size(expected_size),
            'up_to_date': up_to_date,
            'can_resume': can_resume,
        }

        last_finished_update = self.get_last_log(finished=True)
        if last_finished_update:
            loaded_v = last_finished_update.version
            since_loaded = naturaltime(loaded_v.release_datetime)
        else:
            since_loaded = None
        prompt_context['since_loaded_version'] = since_loaded

        prompt = render_to_string(
            'calaccess_raw/updatecalaccessrawdata.txt',
            prompt_context,
        )

        if can_resume:
            if self.confirm_proceed(prompt):
                self.resume_mode = True
            else:
                # Declined to resume: offer a fresh start instead
                self.resume_mode = False
                if not self.confirm_proceed(
                    'Do you want re-start your update?\n'
                ):
                    raise CommandError("Update cancelled")
        else:
            self.resume_mode = False
            if not self.confirm_proceed(prompt):
                raise CommandError("Update cancelled")

    if self.resume_mode:
        self.log_record = last_started_update
    else:
        # get or create a version
        # .get_or_create() throws IntegrityError
        try:
            version = self.raw_data_versions.get(
                release_datetime=current_release_datetime
            )
        except RawDataVersion.DoesNotExist:
            version = self.raw_data_versions.create(
                release_datetime=current_release_datetime,
                size=expected_size
            )
        # create a new log record
        self.log_record = self.command_logs.create(
            version=version,
            command=self,
            called_by=self.get_caller_log()
        )

    # True when the user could have resumed but chose not to
    force_restart_download = can_resume and not self.resume_mode

    # Skip the download when the version being updated was the last one
    # downloaded, that download finished, and no restart is being forced.
    if (
        self.downloading and
        last_download and
        not force_restart_download and
        self.log_record.version == last_download.version and
        last_download.finish_datetime
    ):
        self.log('Already downloaded.')
        self.downloading = False

    if self.downloading:
        if self.test_mode:
            call_command(
                "downloadcalaccessrawdatatest",
                verbosity=self.verbosity,
            )
        else:
            call_command(
                "downloadcalaccessrawdata",
                keep_files=self.keep_files,
                verbosity=self.verbosity,
                noinput=True,
                restart=force_restart_download,
            )
        if self.verbosity:
            self.duration()

    # Remaining steps, unless they were skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    self.log_record.finish_datetime = now()
    self.log_record.save()
def handle(self, *args, **options):
    """
    Download, clean and load the raw data, logging the run outside tests.

    With ``options['test_data']`` set, the download is switched off, files
    are kept and no log record is written. Otherwise an unfinished previous
    update is reused as the log record when its download can be resumed or
    the download is skipped; failing that, a new log record is created for
    the current release.

    Raises CommandError when test data is requested but its tsv directory
    does not exist.
    """
    super(Command, self).handle(*args, **options)

    # Attributes that several methods of the command rely on
    self.app_name = options["app_name"]
    self.keep_files = options["keep_files"]

    using_test_data = options['test_data']
    if using_test_data:
        # Test data is already on disk: nothing to download, nothing to
        # delete afterwards.
        options['download'] = False
        self.keep_files = True
        self.data_dir = get_test_download_directory()
        # Set app-wide because cleancalaccessrawfile also calls
        # get_download_directory
        settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
    else:
        self.data_dir = get_download_directory()

    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)

    self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
    self.tsv_dir = os.path.join(self.data_dir, "tsv/")

    # Fail fast when the test data has not been generated
    if using_test_data:
        if not os.path.exists(self.tsv_dir):
            raise CommandError(
                "Data tsv directory does not exist at %s" % self.tsv_dir
            )
        if self.verbosity:
            self.log("Using test data")

    self.csv_dir = os.path.join(self.data_dir, "csv/")
    if not os.path.exists(self.csv_dir):
        os.makedirs(self.csv_dir)

    download_metadata = self.get_download_metadata()
    self.current_release_datetime = download_metadata['last-modified']
    self.last_update = self.get_last_log()
    self.resume_download = self.check_can_resume_download()

    self.log_record = None

    if not using_test_data:
        # Reuse the previous update's record when that update never
        # finished and we can either resume its download or skip it.
        if (
            self.last_update and
            not self.last_update.finish_datetime and
            (self.resume_download or not options['download'])
        ):
            self.log_record = self.last_update

        if not self.log_record:
            # get or create a version
            # .get_or_create() throws IntegrityError
            try:
                version = self.raw_data_versions.get(
                    release_datetime=self.current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                version = self.raw_data_versions.create(
                    release_datetime=self.current_release_datetime,
                    size=download_metadata['content-length']
                )
            # create a new log record
            self.log_record = self.command_logs.create(
                version=version,
                command=self,
                called_by=self.get_caller()
            )

    if options['download']:
        call_command(
            "downloadcalaccessrawdata",
            keep_files=self.keep_files,
            verbosity=self.verbosity,
            resume=self.resume_download,
            noinput=options['noinput'],
        )
        if self.verbosity:
            self.duration()

    # Remaining steps, unless they were skipped
    if options['clean']:
        self.clean()
        if self.verbosity:
            self.duration()

    if options['load']:
        self.load()
        if self.verbosity:
            self.duration()

    if self.verbosity:
        self.success("Done!")

    if not using_test_data:
        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()