Exemplo n.º 1
0
    def handle(self, *args, **options):
        """
        Build and archive a zip file for every version that lacks one.

        Runs the parent handler, empties and recreates the download
        directory, then walks every RawDataVersion whose clean_zip_archive
        is blank and calls download_clean_files, create_zip_file and
        archive_zip_file on it, in that order.
        """
        super(Command, self).handle(*args, **options)

        zip_name = 'calaccess_cleaned.zip'
        self.clean_zip_path = os.path.join(get_download_directory(), zip_name)
        self.data_dir = get_download_directory()

        # Always start from an empty download directory.
        if os.path.exists(self.data_dir):
            shutil.rmtree(self.data_dir)
        os.makedirs(self.data_dir)

        message = 'Creating zip file for {:%Y-%m-%d_%H-%M-%S} version'

        # Iterating an empty result set is a no-op, so no emptiness guard
        # is needed around the loop.
        for version in RawDataVersion.objects.filter(clean_zip_archive=''):
            logger.debug(message.format(version.release_datetime))

            self.download_clean_files(version)
            self.create_zip_file(version)
            self.archive_zip_file(version)
    def handle(self, *args, **options):
        """
        Parse the options, then run the download, combine and load stages.

        ``agencies`` and ``years`` arrive as comma-separated strings (or
        None) and are stored as lists. Each stage runs unless its matching
        ``skip_*`` option is truthy.
        """
        # Parse command-line options
        self.verbosity = int(options['verbosity'])
        self.max_lines_per_load = int(options['max_lines_per_load'])

        raw_agencies = options['agencies']
        self.agencies = [] if raw_agencies is None else raw_agencies.split(',')

        raw_years = options['years']
        self.years = [] if raw_years is None else raw_years.split(',')

        self.force = options['force']

        # Compute properties
        csv_dir = os.path.join(get_download_directory(), 'csv')
        self.data_dir = csv_dir
        self.combined_csv_path = os.path.join(
            csv_dir,
            'netfile_cal201_transaction.csv',
        )
        self.connect2 = Connect2API()

        # Run the stages that were not skipped
        if not options['skip_download']:
            self.download()

        if not options['skip_combine']:
            self.combine()

        if not options['skip_load']:
            # the cursor is only opened when loading
            self.cursor = connection.cursor()
            self.load()
Exemplo n.º 3
0
    def set_options(self, *args, **kwargs):
        """
        Configure the paths, download prompt and verbosity for the command.

        Reads the ``test_data``, ``download`` and ``verbosity`` entries of
        kwargs (a KeyError is raised if one is absent). The data directory
        and its csv subdirectory are created when missing. When
        ``download`` is truthy, the remote and local metadata are fetched
        and the confirmation prompt is rendered into ``self.prompt``.
        """
        self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'

        if kwargs['test_data']:
            self.data_dir = get_test_download_directory()
            # Point the setting at the test directory as well.
            # NOTE(review): presumably so other code that looks up the
            # download directory sees the same location -- confirm.
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        else:
            self.data_dir = get_download_directory()

        # makedirs rather than mkdir: mkdir raises when a parent directory
        # of the configured location does not exist yet.
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        if kwargs['download']:
            self.download_metadata = self.get_download_metadata()
            self.local_metadata = self.get_local_metadata()
            prompt_context = dict(
                last_updated=self.download_metadata['last-modified'],
                time_ago=naturaltime(self.download_metadata['last-modified']),
                size=size(self.download_metadata['content-length']),
                last_download=self.local_metadata['last-download'],
                download_dir=self.data_dir,
            )
            self.prompt = render_to_string(
                'calaccess_raw/downloadcalaccessrawdata.txt',
                prompt_context,
            )

        self.verbosity = int(kwargs['verbosity'])
    def set_options(self, *args, **kwargs):
        """
        Configure verbosity, paths and the download prompt for the command.

        Reads ``verbosity``, ``test_data`` and ``download`` from kwargs.
        Creates the data directory and its csv subdirectory when missing,
        and renders the confirmation prompt when ``download`` is truthy.
        """
        self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
        self.verbosity = int(kwargs['verbosity'])

        if not kwargs['test_data']:
            self.data_dir = get_download_directory()
        else:
            self.data_dir = get_test_download_directory()
            # the setting is overridden too, not just the attribute
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
            if self.verbosity:
                self.log("Using test data")

        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        if not kwargs['download']:
            return

        # Gather what the prompt template needs
        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()
        last_modified = self.download_metadata['last-modified']
        prompt_context = {
            'last_updated': last_modified,
            'time_ago': naturaltime(last_modified),
            'size': size(self.download_metadata['content-length']),
            'last_download': self.local_metadata['last-download'],
            'download_dir': self.data_dir,
        }
        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
    def handle(self, *args, **options):
        """
        Read the command options and run whichever stages were not skipped.

        Stores verbosity, the per-load line limit, the agency and year
        filters (split on commas, empty list when None) and the force flag,
        then calls download, combine and load unless the corresponding
        ``skip_download`` / ``skip_combine`` / ``skip_load`` option is set.
        """
        # Parse command-line options
        self.verbosity = int(options['verbosity'])
        self.max_lines_per_load = int(options['max_lines_per_load'])

        agency_option = options['agencies']
        if agency_option is not None:
            self.agencies = agency_option.split(',')
        else:
            self.agencies = []

        year_option = options['years']
        if year_option is not None:
            self.years = year_option.split(',')
        else:
            self.years = []

        self.force = options['force']

        # Compute properties
        self.data_dir = os.path.join(get_download_directory(), 'csv')
        combined_name = 'netfile_cal201_transaction.csv'
        self.combined_csv_path = os.path.join(self.data_dir, combined_name)
        self.connect2 = Connect2API()

        # Run the thing!
        if not options['skip_download']:
            self.download()

        if not options['skip_combine']:
            self.combine()

        if not options['skip_load']:
            self.cursor = connection.cursor()
            self.load()
Exemplo n.º 6
0
    def handle(self, *args, **options):
        """
        Set the options shared by every command.

        Subclasses are expected to define their own handle method, as is
        standard in Django, and reach this one through a super call.

        Stores the verbosity and no_color options, records the start time
        and makes sure the ``processed`` directory exists inside the
        download directory.
        """
        # Global options
        self.verbosity = options.get("verbosity")
        self.no_color = options.get("no_color")

        # Start the clock
        self.start_datetime = timezone.now()

        # Locate the processed data directory
        self.data_dir = get_download_directory()
        self.processed_data_dir = os.path.join(self.data_dir, 'processed')

        if os.path.exists(self.processed_data_dir):
            return

        # make the processed data directory
        os.makedirs(self.processed_data_dir)
        # set permissions to allow other users to write and execute
        os.chmod(self.processed_data_dir, 0o703)
 def set_options(self, *args, **kwargs):
     """
     Configure the paths, prompt, progress bar and verbosity.

     Reads the ``download`` and ``verbosity`` entries of kwargs (a
     KeyError is raised if one is absent). The data directory and its csv
     subdirectory are created when missing. When ``download`` is truthy,
     the remote and local metadata are fetched, the confirmation prompt
     is rendered and a progress bar sized by the remote content length is
     built.
     """
     self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
     self.data_dir = get_download_directory()

     # makedirs rather than mkdir: mkdir raises when a parent directory
     # of the configured location does not exist yet.
     if not os.path.exists(self.data_dir):
         os.makedirs(self.data_dir)

     self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
     self.tsv_dir = os.path.join(self.data_dir, "tsv/")
     self.csv_dir = os.path.join(self.data_dir, "csv/")
     if not os.path.exists(self.csv_dir):
         os.makedirs(self.csv_dir)

     if kwargs['download']:
         self.download_metadata = self.get_download_metadata()
         self.local_metadata = self.get_local_metadata()
         prompt_context = dict(
             last_updated=self.download_metadata['last-modified'],
             time_ago=naturaltime(self.download_metadata['last-modified']),
             size=size(self.download_metadata['content-length']),
             last_download=self.local_metadata['last-download'],
             download_dir=self.data_dir,
         )
         self.prompt = render_to_string(
             'calaccess_raw/downloadcalaccessrawdata.txt',
             prompt_context,
         )
         # The bar's maximum is the remote file size reported in the
         # download metadata.
         self.pbar = progressbar.ProgressBar(
             widgets=[
                 progressbar.Percentage(),
                 progressbar.Bar(),
                 ' ',
                 progressbar.ETA(),
                 ' ',
                 progressbar.FileTransferSpeed()
             ],
             maxval=self.download_metadata['content-length']
         )

     self.verbosity = int(kwargs['verbosity'])
 def set_config(self, *args, **options):
     """
     Store directories, sample size, source file list and verbosity.

     Reads ``samplerows`` and ``verbosity`` from options and lists the
     contents of the tsv folder inside the download directory.
     """
     tsv_folder = "tsv/"

     # Source and destination locations
     self.data_dir = get_download_directory()
     self.test_data_dir = get_test_download_directory()
     self.tsv_dir = os.path.join(self.data_dir, tsv_folder)
     self.sample_dir = os.path.join(self.test_data_dir, tsv_folder)

     # Sampling parameters
     self.sample_rows = int(options['samplerows'])
     self.tsv_list = os.listdir(self.tsv_dir)
     self.verbosity = int(options['verbosity'])
 def handle_label(self, label, **options):
     """
     Store the verbosity and working directories, then clean ``label``.

     The verbosity option is stored as given (no conversion is applied).
     """
     # Set options
     self.verbosity = options.get("verbosity")
     download_dir = get_download_directory()
     self.data_dir = download_dir
     self.tsv_dir = os.path.join(download_dir, "tsv/")
     self.csv_dir = os.path.join(download_dir, "csv/")

     # Do it
     self.clean(label)
 def set_config(self, *args, **options):
     """
     Record where to read from, where to write to and how much to sample.

     ``samplerows`` and ``verbosity`` are taken from options and
     converted to int; ``tsv_list`` holds the directory listing of the
     tsv folder under the download directory.
     """
     source_root = get_download_directory()
     sample_root = get_test_download_directory()

     self.data_dir = source_root
     self.test_data_dir = sample_root
     self.tsv_dir = os.path.join(source_root, "tsv/")
     self.sample_dir = os.path.join(sample_root, "tsv/")
     self.sample_rows = int(options['samplerows'])
     self.tsv_list = os.listdir(self.tsv_dir)
     self.verbosity = int(options['verbosity'])
def get_download_directory():
    """
    Return the directory where downloaded data is stored.

    The NETFILE_DOWNLOAD_DIR setting wins when it is defined; otherwise
    the value comes from calaccess_raw.get_download_directory().
    """
    if not hasattr(settings, 'NETFILE_DOWNLOAD_DIR'):
        return calaccess_raw.get_download_directory()
    return settings.NETFILE_DOWNLOAD_DIR
Exemplo n.º 12
0
 def handle(self, *args, **options):
     """
     Set the source and target CSV paths, then transform and load.

     Both files sit in the csv folder of the download directory; the
     method finishes by calling transform_csv and load_csv.
     """
     self.header("Loading summary totals")
     self.data_dir = get_download_directory()

     csv_folder = os.path.join(self.data_dir, 'csv')
     self.source_csv = os.path.join(csv_folder, 'smry_cd.csv')
     self.target_csv = os.path.join(csv_folder, 'smry_cd_transformed.csv')

     self.transform_csv()
     self.load_csv()
def get_download_directory():
    """
    Return the download directory used to store downloaded data.

    Uses settings.NETFILE_DOWNLOAD_DIR when that setting exists and falls
    back to calaccess_raw.get_download_directory() when it does not.
    """
    has_override = hasattr(settings, 'NETFILE_DOWNLOAD_DIR')
    if has_override:
        return settings.NETFILE_DOWNLOAD_DIR
    return calaccess_raw.get_download_directory()
 def handle_label(self, label, **options):
     """
     Store verbosity and the tsv/csv/log directories, then clean ``label``.

     The verbosity option is converted to int before being stored.
     """
     # Set options
     self.verbosity = int(options.get("verbosity"))
     root = get_download_directory()
     self.data_dir = root
     self.tsv_dir = os.path.join(root, "tsv/")
     self.csv_dir = os.path.join(root, "csv/")
     self.log_dir = os.path.join(root, "log/")

     # Do it
     self.clean(label)
    def handle(self, *args, **options):
        """
        Sample rows from every source TSV file into the test data directory.

        Reads ``samplerows`` and ``verbosity`` from options. For each file
        listed in the tsv directory a sample is written, header first, to
        a file of the same name in the (emptied) sample directory. After
        that save_zip is called, and the release datetime and size of the
        version attached to the most recently started, finished
        ``downloadcalaccessrawdata`` log are written to
        ``sampled_version.txt`` in the test data directory.

        NOTE(review): the ``[0]`` lookup presumably raises IndexError when
        no finished download log exists -- confirm against the caller.
        """
        super(Command, self).handle(*args, **options)

        # Set options
        self.data_dir = get_download_directory()
        self.test_data_dir = get_test_download_directory()
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")
        self.sample_dir = os.path.join(self.test_data_dir, "tsv/")
        self.sample_rows = int(options['samplerows'])
        self.tsv_list = os.listdir(self.tsv_dir)
        self.verbosity = int(options['verbosity'])

        self.header("Sampling %i rows from %s source files" % (
            self.sample_rows,
            len(self.tsv_list),
        ))

        # Make sure sample dir exists and is empty
        if not os.path.exists(self.test_data_dir):
            os.makedirs(self.test_data_dir)
        if os.path.exists(self.sample_dir):
            shutil.rmtree(self.sample_dir)
        os.makedirs(self.sample_dir)

        # Loop through all the files in the source directory
        for name in progress.bar(self.tsv_list):

            # Find the input and output paths (named so the builtin
            # ``file`` is not shadowed)
            in_path = os.path.join(self.tsv_dir, name)
            out_path = os.path.join(self.sample_dir, name)

            if self.verbosity > 2:
                self.log(" Sampling %s" % in_path)

            # Open the file
            fi = FileInput(in_path, True)

            # Generate our sample
            sample = two_pass_sample(fi, sample_size=self.sample_rows)

            # Write the header followed by the sampled lines
            with open(out_path, 'wb') as out:
                for line in chain(fi.header, sample):
                    out.write(line)

        self.header("Compressing zip file...")
        self.save_zip()

        # Stash the release_datetime and size of the last completed download
        version = self.command_logs.filter(
            command='downloadcalaccessrawdata',
            finish_datetime__isnull=False
        ).order_by('-start_datetime')[0].version

        # Build the path with os.path.join like every other path here,
        # rather than concatenating a hard-coded separator.
        version_path = os.path.join(self.test_data_dir, 'sampled_version.txt')
        with open(version_path, 'w') as f:
            f.write(str(version.release_datetime) + '\n')
            f.write(str(version.size))
    def handle(self, *args, **options):
        """
        Prepare directories and run the download, clean and load steps.

        Reads ``app_name``, ``database``, ``keep_files``, ``test_data``,
        ``download``, ``resume``, ``noinput``, ``clean`` and ``load`` from
        options. In test mode the download step is switched off, files
        are always kept, and a CommandError is raised when the test tsv
        directory is missing.
        """
        super(Command, self).handle(*args, **options)

        # attributes shared by several methods of the class
        self.app_name = options["app_name"]
        self.database = options["database"]
        self.keep_files = options["keep_files"]

        testing = options['test_data']

        if testing:
            # test data is already on disk, so there is nothing to download
            options["download"] = False
            # and test runs always keep their files
            self.keep_files = True
            self.data_dir = get_test_download_directory()
            # need to set this app-wide because cleancalaccessrawfile
            #   also calls get_download_directory
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        else:
            self.data_dir = get_download_directory()

        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # With test data, fail fast if the tsv directory is not there.
        if testing:
            if not os.path.exists(self.tsv_dir):
                raise CommandError(
                    "Data tsv directory does not exist at %s" % self.tsv_dir
                )
            elif self.verbosity:
                self.log("Using test data")

        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        if options['download']:
            call_command(
                "downloadcalaccessrawdata",
                keep_files=self.keep_files,
                verbosity=self.verbosity,
                resume=options['resume'],
                noinput=options['noinput']
            )

        # run the remaining steps unless they were skipped
        if options['clean']:
            self.clean()
        if options['load']:
            self.load()

        if self.verbosity:
            self.success("Done!")
Exemplo n.º 17
0
    def set_options(self, *args, **kwargs):
        """
        Set the csv directory, database cursor and quarterly file paths.

        The csv directory inside the download directory is created when
        missing. ``quarterly_tmp_csv`` holds only the *name* of a
        temporary file.
        """
        self.data_dir = os.path.join(get_download_directory(), 'csv')

        # Make sure directory exists. makedirs rather than mkdir: the csv
        # folder is nested inside the download directory, and mkdir raises
        # when that parent does not exist yet.
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.cursor = connection.cursor()

        # Quarterlies stuff
        # NOTE(review): the NamedTemporaryFile object is not kept, so only
        # its name survives -- presumably the file itself is gone once the
        # object is collected; confirm that is intended.
        self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
        self.quarterly_target_csv = os.path.join(self.data_dir,
                                                 'expn_cd_transformed.csv')
 def handle(self, *args, **options):
     """
     Point at the summary CSV files, then run the transform and the load.

     ``source_csv`` and ``target_csv`` both live in the csv folder of the
     download directory.
     """
     self.header("Loading summary totals")

     download_dir = get_download_directory()
     self.data_dir = download_dir
     self.source_csv = os.path.join(download_dir, 'csv', 'smry_cd.csv')
     self.target_csv = os.path.join(
         download_dir, 'csv', 'smry_cd_transformed.csv')

     self.transform_csv()
     self.load_csv()
 def set_options(self, *args, **kwargs):
     """
     Set the csv directory, cursor and the quarterly / late filing paths.

     Each ``*_tmp_csv`` attribute holds the name of a temporary file and
     each ``*_target_csv`` a path inside the csv directory. The late
     filings temp table is named after S497Cd's database table.
     """
     self.data_dir = os.path.join(get_download_directory(), 'csv')
     self.cursor = connection.cursor()

     def in_data_dir(file_name):
         # path of file_name inside the csv directory
         return os.path.join(self.data_dir, file_name)

     # Quarterly filings
     self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
     self.quarterly_target_csv = in_data_dir('rcpt_cd_transformed.csv')

     # Late filings
     self.late_tmp_csv = tempfile.NamedTemporaryFile().name
     self.late_target_csv = in_data_dir('s497_cd_transformed.csv')
     self.late_tmp_table = "TMP_%s" % S497Cd._meta.db_table
    def handle(self, *args, **options):
        """
        Store the options and paths, then download and load unless skipped.

        ``max_lines_per_load`` defaults to 1000 when the option is absent.
        """
        self.verbosity = int(options['verbosity'])
        self.max_lines_per_load = int(options.get('max_lines_per_load', 1000))

        csv_dir = os.path.join(get_download_directory(), 'csv')
        self.data_dir = csv_dir
        self.zip_path = os.path.join(csv_dir, 'zipcode_metro.zip')

        if not options['skip_download']:
            self.download()

        if not options['skip_load']:
            # a cursor is only needed for the load step
            self.cursor = connection.cursor()
            self.load()
Exemplo n.º 21
0
    def handle(self, *args, **options):
        """
        Store the options and paths, then download and load unless skipped.

        Reads ``database``, ``verbosity``, ``skip_download`` and
        ``skip_load`` from options; ``self.csv`` starts out as None.
        """
        self.csv = None
        self.database = options['database']
        self.verbosity = int(options['verbosity'])

        csv_dir = os.path.join(get_download_directory(), 'csv')
        self.data_dir = csv_dir
        self.zip_path = os.path.join(csv_dir, 'zipcode_metro.zip')

        if not options['skip_download']:
            self.download()

        if not options['skip_load']:
            self.load()
Exemplo n.º 22
0
    def handle(self, *args, **options):
        """
        Set up state for the command and run its download and load steps.

        ``self.csv`` is reset to None, the database alias and verbosity
        come from options, and each step is skipped when its ``skip_*``
        option is truthy.
        """
        self.csv = None
        self.database = options['database']
        self.verbosity = int(options['verbosity'])

        # the zip file is kept in the csv folder of the download directory
        self.data_dir = os.path.join(get_download_directory(), 'csv')
        zip_name = 'zipcode_metro.zip'
        self.zip_path = os.path.join(self.data_dir, zip_name)

        if not options['skip_download']:
            self.download()

        if not options['skip_load']:
            self.load()
Exemplo n.º 23
0
    def handle(self, *args, **options):
        """
        Read the options, compute the paths and run the enabled steps.

        ``max_lines_per_load`` falls back to 1000 when not supplied. The
        database cursor is opened only when the load step runs.
        """
        self.verbosity = int(options['verbosity'])
        self.max_lines_per_load = int(options.get('max_lines_per_load', 1000))

        # the zip file is kept in the csv folder of the download directory
        self.data_dir = os.path.join(get_download_directory(), 'csv')
        zip_name = 'zipcode_metro.zip'
        self.zip_path = os.path.join(self.data_dir, zip_name)

        if not options['skip_download']:
            self.download()

        if not options['skip_load']:
            self.cursor = connection.cursor()
            self.load()
    def set_options(self, *args, **kwargs):
        """
        Set the csv directory, database cursor and quarterly file paths.

        The csv directory inside the download directory is created when
        missing. ``quarterly_tmp_csv`` holds only the *name* of a
        temporary file.
        """
        self.data_dir = os.path.join(get_download_directory(), 'csv')

        # Make sure directory exists. makedirs rather than mkdir: the csv
        # folder is nested inside the download directory, and mkdir raises
        # when that parent does not exist yet.
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.cursor = connection.cursor()

        # Quarterlies stuff
        # NOTE(review): the NamedTemporaryFile object is not kept, so only
        # its name survives -- presumably the file itself is gone once the
        # object is collected; confirm that is intended.
        self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
        self.quarterly_target_csv = os.path.join(
            self.data_dir,
            'expn_cd_transformed.csv'
        )
    def handle(self, *args, **options):
        """
        Clean one raw file and record the run in the command log.

        Reads ``file_name`` and ``keep_files`` from options. The version
        comes from the calling command when get_caller returns one, and
        otherwise from the latest RawDataVersion; when no version exists,
        nothing is logged. The source tsv file is removed afterwards
        unless ``keep_files`` is truthy.
        """
        super(Command, self).handle(*args, **options)

        # Set options
        self.file_name = options['file_name']
        self.data_dir = get_download_directory()
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        self.log_dir = os.path.join(self.data_dir, "log/")

        if self.verbosity > 2:
            self.log(" Cleaning %s" % self.file_name)

        caller = self.get_caller()

        if not caller:
            # run directly: fall back to the most recent version
            try:
                self.version = self.raw_data_versions.latest('release_datetime')
            except RawDataVersion.DoesNotExist:
                # if there's no version, assume this is a test and do not log
                # TODO: Figure out a more direct way to handle this
                self.version = None
            else:
                # a version was found, so log against it (no caller here)
                self.log_record = self.command_logs.create(
                    version=self.version,
                    command=self,
                    file_name=self.file_name.upper().replace('.TSV', '')
                )
        else:
            # called by another command: reuse its version record
            self.version = caller.version
            self.log_record = self.command_logs.create(
                version=self.version,
                command=self,
                called_by=caller,
                file_name=self.file_name.upper().replace('.TSV', '')
            )

        self.clean(options['file_name'])

        # unless keeping files, remove tsv files
        if not options['keep_files']:
            tsv_path = os.path.join(self.tsv_dir, options['file_name'])
            os.remove(tsv_path)

        if self.version:
            # stamp the finish time and save the log record
            self.log_record.finish_datetime = datetime.now()
            self.log_record.save()
Exemplo n.º 26
0
    def handle(self, *args, **options):
        """
        Clean one raw file and record the run in the command log.

        Reads ``file_name`` and ``keep_files`` from options. When
        get_caller_log returns a log, its version is reused and the new
        record is linked to it; otherwise the latest RawDataVersion is
        used, and nothing is logged when there is none. The source tsv
        file is removed afterwards unless ``keep_files`` is truthy.
        """
        super(Command, self).handle(*args, **options)

        # Set options
        self.file_name = options['file_name']
        root = get_download_directory()
        self.data_dir = root
        self.tsv_dir = os.path.join(root, "tsv/")
        self.csv_dir = os.path.join(root, "csv/")
        self.log_dir = os.path.join(root, "log/")

        if self.verbosity > 2:
            self.log(" Cleaning %s" % self.file_name)

        caller = self.get_caller_log()

        if not caller:
            # not called by another command: look up the newest version
            try:
                self.version = self.raw_data_versions.latest('release_datetime')
            except RawDataVersion.DoesNotExist:
                # if there's no version, assume this is a test and do not log
                # TODO: Figure out a more direct way to handle this
                self.version = None
            else:
                # log against the version that was just found
                self.log_record = self.command_logs.create(
                    version=self.version,
                    command=self,
                    file_name=self.file_name.upper().replace('.TSV', '')
                )
        else:
            # if called by another command, use its version record
            self.version = caller.version
            self.log_record = self.command_logs.create(
                version=self.version,
                command=self,
                called_by=caller,
                file_name=self.file_name.upper().replace('.TSV', '')
            )

        self.clean(options['file_name'])

        # the raw tsv file is deleted unless the caller asked to keep it
        if not options['keep_files']:
            raw_path = os.path.join(self.tsv_dir, options['file_name'])
            os.remove(raw_path)

        if self.version:
            # save the log record
            self.log_record.finish_datetime = datetime.now()
            self.log_record.save()
 def set_options(self, *args, **kwargs):
     """
     Set the csv directory, cursor and the quarterly / late filing paths.

     Temporary CSV attributes hold temp file names; target CSV attributes
     point into the csv directory. The late filings temp table name is
     derived from S497Cd's database table.
     """
     csv_dir = os.path.join(get_download_directory(), 'csv')
     self.data_dir = csv_dir
     self.cursor = connection.cursor()

     # Quarterly filings
     self.quarterly_tmp_csv = tempfile.NamedTemporaryFile().name
     self.quarterly_target_csv = os.path.join(
         csv_dir, 'rcpt_cd_transformed.csv')

     # Late filings
     self.late_tmp_csv = tempfile.NamedTemporaryFile().name
     self.late_target_csv = os.path.join(
         csv_dir, 's497_cd_transformed.csv')
     self.late_tmp_table = "TMP_%s" % S497Cd._meta.db_table
    def handle(self, *args, **options):
        """
        Store the options and paths, then download, combine and load.

        ``max_lines_per_load`` defaults to 1000 when the option is absent.
        Each stage is skipped when its ``skip_*`` option is truthy.
        """
        self.verbosity = int(options['verbosity'])
        self.max_lines_per_load = int(options.get('max_lines_per_load', 1000))

        csv_dir = os.path.join(get_download_directory(), 'csv')
        self.data_dir = csv_dir
        self.combined_csv_path = os.path.join(
            csv_dir,
            'netfile_cal201_transaction.csv',
        )
        self.connect2 = Connect2API()

        if not options['skip_download']:
            self.download()

        if not options['skip_combine']:
            self.combine()

        if not options['skip_load']:
            # the cursor is only opened for the load stage
            self.cursor = connection.cursor()
            self.load()
    def handle(self, *args, **options):
        """
        Confirm with the user, then download, unzip and prep the raw data.

        Reads ``resume``, ``noinput`` and ``keep_files`` from options. A
        resume is only honoured when a partial zip file exists and is
        newer than the remote file's last-modified time. Unless
        ``noinput`` is set the user must answer exactly 'yes', otherwise
        the command reports a failure and returns.
        """
        super(Command, self).handle(*args, **options)

        # the data directory comes from the app settings; create if missing
        self.data_dir = get_download_directory()
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        # the zip file and the last-download tracking file sit in data_dir
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')

        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()

        total_size = self.download_metadata['content-length']
        last_modified = self.download_metadata['last-modified']
        last_download = self.local_metadata['last-download']
        cur_size = 0

        # resuming needs both the option and an existing zip file
        self.resume_download = (options['resume'] and os.path.exists(self.zip_path))

        if self.resume_download:
            # the partial download must be newer than the last update
            #   to the remote data
            chunk_datetime = datetime.fromtimestamp(
                os.path.getmtime(self.zip_path),
                utc,
            )
            self.resume_download = chunk_datetime > last_modified
            # only when still resuming do the prompt values change
            if self.resume_download:
                last_download = chunk_datetime
                cur_size = os.path.getsize(self.zip_path)

        # values for the confirmation prompt template
        prompt_context = {
            'resuming': self.resume_download,
            'already_downloaded': last_modified == last_download,
            'last_modified': last_modified,
            'last_download': last_download,
            'time_ago': naturaltime(last_download),
            'total_size': size(total_size),
            'cur_size': size(cur_size),
            'download_dir': self.data_dir,
        }

        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )

        # If we're taking user input, make sure the user says exactly 'yes'
        if not options['noinput'] and self.confirm_download() != 'yes':
            self.failure("Download cancelled")
            return

        self.download()
        self.unzip()

        discard_files = not options['keep_files']

        if discard_files:
            os.remove(self.zip_path)

        self.prep()

        if discard_files:
            shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))
    def handle(self, *args, **options):
        """
        Run the update: download, clean and load, with a command log record.

        Reads ``app_name``, ``keep_files``, ``test_data``, ``download``,
        ``noinput``, ``clean`` and ``load`` from options. In test mode the
        download is switched off, files are kept, no log record is used,
        and a CommandError is raised when the test tsv directory is
        missing. Outside test mode an unfinished previous update is
        reused as the log record when it can be resumed (or the download
        is skipped); otherwise a version is fetched or created and a new
        log record is started. The record's finish time is saved at the
        end.
        """
        super(Command, self).handle(*args, **options)

        # set / compute any attributes that multiple class methods need
        self.app_name = options["app_name"]
        self.keep_files = options["keep_files"]

        if options['test_data']:
            # if using test data, we don't need to download
            options['download'] = False
            # and always keep files when running test data
            self.keep_files = True
            self.data_dir = get_test_download_directory()
            # need to set this app-wide because cleancalaccessrawfile
            #   also calls get_download_directory
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        else:
            self.data_dir = get_download_directory()

        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # With test data, fail fast if the tsv directory is not there.
        if options['test_data']:
            if not os.path.exists(self.tsv_dir):
                raise CommandError(
                    "Data tsv directory does not exist at %s" % self.tsv_dir
                )
            elif self.verbosity:
                self.log("Using test data")

        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        download_metadata = self.get_download_metadata()
        self.current_release_datetime = download_metadata['last-modified']
        self.last_update = self.get_last_log()
        self.resume_download = self.check_can_resume_download()
        self.log_record = None

        # log records are only kept for real (non-test) runs
        if not options['test_data']:
            # Reuse the previous update's record when that update exists,
            # never finished, and the download can either be resumed or
            # is being skipped altogether.
            if (
                self.last_update and
                not self.last_update.finish_datetime and
                (self.resume_download or not options['download'])
            ):
                self.log_record = self.last_update

            # nothing to resume: start a fresh record
            if not self.log_record:
                # get or create a version
                # .get_or_create() throws IntegrityError
                try:
                    version = self.raw_data_versions.get(
                        release_datetime=self.current_release_datetime
                    )
                except RawDataVersion.DoesNotExist:
                    version = self.raw_data_versions.create(
                        release_datetime=self.current_release_datetime,
                        size=download_metadata['content-length']
                    )
                # create a new log record
                self.log_record = self.command_logs.create(
                    version=version,
                    command=self,
                    called_by=self.get_caller()
                )

        if options['download']:
            call_command(
                "downloadcalaccessrawdata",
                keep_files=self.keep_files,
                verbosity=self.verbosity,
                resume=self.resume_download,
                noinput=options['noinput'],
            )
            if self.verbosity:
                self.duration()

        # execute the other steps that haven't been skipped
        if options['clean']:
            self.clean()
            if self.verbosity:
                self.duration()

        if options['load']:
            self.load()
            if self.verbosity:
                self.duration()

        if self.verbosity:
            self.success("Done!")

        # stamp and save the log record (there is none in test mode)
        if not options['test_data']:
            self.log_record.finish_datetime = datetime.now()
            self.log_record.save()
Exemplo n.º 31
0
 def get_tsv_path(self):
     """
     Return the path of this object's TSV file.

     The file name comes from get_tsv_name and is placed in the ``tsv``
     folder of the download directory.
     """
     download_dir = get_download_directory()
     tsv_name = self.get_tsv_name()
     return os.path.join(download_dir, 'tsv', tsv_name)
Exemplo n.º 32
0
 def get_tsv_path(self):
     """
     Build the path to this object's TSV file in the download directory.

     Joins the download directory, the ``tsv`` folder and the name
     returned by get_tsv_name.
     """
     tsv_folder = os.path.join(get_download_directory(), 'tsv')
     return os.path.join(tsv_folder, self.get_tsv_name())
    def handle(self, *args, **options):
        """
        Run the update: decide whether to resume or start over, then
        download, clean and load as requested by the options.

        Reads the ``app_name``, ``keep_files``, ``test_data``, ``download``,
        ``clean``, ``load`` and ``noinput`` options.

        Raises CommandError when test data is requested but its tsv
        directory is missing, or when the user declines a confirmation
        prompt.
        """
        super(Command, self).handle(*args, **options)

        # set / compute any attributes that multiple class methods need
        self.app_name = options["app_name"]
        self.keep_files = options["keep_files"]
        self.test_mode = options['test_data']
        self.downloading = options['download']
        self.cleaning = options['clean']
        self.loading = options['load']

        if self.test_mode:
            # if using test data, we don't need to download
            self.downloading = False
            # and always keep files when running test data
            self.keep_files = True
            self.data_dir = get_test_download_directory()
            # need to set this app-wide because cleancalaccessrawfile
            #   also calls get_download_directory
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        else:
            self.data_dir = get_download_directory()

        # create the data directory if it is not there yet
        os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # Immediately check that the tsv directory exists when using test data,
        #   so we can stop immediately.
        if self.test_mode:
            if not os.path.exists(self.tsv_dir):
                raise CommandError("Data tsv directory does not exist "
                                   "at %s" % self.tsv_dir)
            elif self.verbosity:
                self.log("Using test data")

        # create the csv output directory if it is not there yet
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

        # NOTE(review): the metadata is fetched even in test mode — confirm
        # get_download_metadata() does not need network access there
        download_metadata = self.get_download_metadata()
        current_release_datetime = download_metadata['last-modified']
        last_started_update = self.get_last_log()

        # newest log of the download command, or None when there is none
        try:
            last_download = self.command_logs.filter(
                command='downloadcalaccessrawdata').order_by(
                    '-start_datetime')[0]
        except IndexError:
            last_download = None

        up_to_date = False
        can_resume = False

        # if there's a previously started update
        if last_started_update:
            # if current release datetime matches version of last started update
            if current_release_datetime == last_started_update.version.release_datetime:
                # if the last update finished
                if last_started_update.finish_datetime:
                    up_to_date = True
                else:
                    # if the last update didn't finish
                    # (but is still for the current version)
                    can_resume = True
            # if the last started update didn't finish
            elif not last_started_update.finish_datetime:
                # can resume update of old version as long as skipping download
                if not self.downloading:
                    can_resume = True
                # or if there is a last download
                elif last_download:
                    # and last download's version matches the outstanding update version
                    if last_download.version == last_started_update.version:
                        # and last download completed
                        if last_download.finish_datetime:
                            can_resume = True

        if options['noinput']:
            # if not taking input and can resume, automatically go into resume mode
            self.resume_mode = can_resume
        else:
            # values handed to the confirmation prompt template
            prompt_context = dict(
                current_release_datetime=current_release_datetime,
                expected_size=size(download_metadata['content-length']),
                up_to_date=up_to_date,
                can_resume=can_resume,
            )

            last_finished_update = self.get_last_log(finished=True)

            if last_finished_update:
                loaded_v = last_finished_update.version
                prompt_context['since_loaded_version'] = naturaltime(
                    loaded_v.release_datetime)
            else:
                prompt_context['since_loaded_version'] = None

            prompt = render_to_string(
                'calaccess_raw/updatecalaccessrawdata.txt',
                prompt_context,
            )

            if can_resume:
                # first answer picks resume; declining leads to a second
                # question about starting over, and declining that aborts
                if self.confirm_proceed(prompt):
                    self.resume_mode = True
                else:
                    self.resume_mode = False
                    if not self.confirm_proceed(
                            'Do you want re-start your update?\n'):
                        raise CommandError("Update cancelled")
            else:
                self.resume_mode = False
                if not self.confirm_proceed(prompt):
                    raise CommandError("Update cancelled")

        # no log record is kept for test-data runs
        if not self.test_mode:
            if self.resume_mode:
                # keep writing to the log of the unfinished update
                self.log_record = last_started_update
            else:
                # get or create a version
                # .get_or_create() throws IntegrityError
                try:
                    version = self.raw_data_versions.get(
                        release_datetime=current_release_datetime)
                except RawDataVersion.DoesNotExist:
                    version = self.raw_data_versions.create(
                        release_datetime=current_release_datetime,
                        size=download_metadata['content-length'])
                # create a new log record
                self.log_record = self.command_logs.create(
                    version=version,
                    command=self,
                    called_by=self.get_caller_log())

        # if the user could have resumed but didn't
        force_restart_download = can_resume and not self.resume_mode

        # if not skipping download, and there's a previous download
        # (self.downloading is always False in test mode, so self.log_record
        # is set whenever the check below reads it)
        if self.downloading and last_download:
            # if not forcing a restart
            if not force_restart_download:
                # check if version we are updating is last one being downloaded
                if self.log_record.version == last_download.version:
                    # if it finished
                    if last_download.finish_datetime:
                        self.log('Already downloaded.')
                        self.downloading = False

        if self.downloading:
            # the download command is run non-interactively; any prompting
            # already happened above
            call_command(
                "downloadcalaccessrawdata",
                keep_files=self.keep_files,
                verbosity=self.verbosity,
                noinput=True,
                restart=force_restart_download,
            )
            if self.verbosity:
                self.duration()

        # execute the other steps that haven't been skipped
        if options['clean']:
            self.clean()
            if self.verbosity:
                self.duration()

        if options['load']:
            self.load()
            if self.verbosity:
                self.duration()

        if self.verbosity:
            self.success("Done!")

        # stamp the log record as finished (none exists for test-data runs)
        if not self.test_mode:
            self.log_record.finish_datetime = datetime.now()
            self.log_record.save()
    def set_options(self, *args, **kwargs):
        """
        Configure the command from its keyword options.

        Stores the source URL, the verbosity and the database option, and
        resolves the data, tsv and csv directories, creating the data and
        csv directories when they are missing. When the ``download`` option
        is on it also works out whether a partial download can be resumed
        and renders the confirmation prompt into ``self.prompt``.

        Raises CommandError when test data is requested but its tsv
        directory does not exist.
        """
        self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
        self.verbosity = int(kwargs['verbosity'])
        self.database = kwargs['database']
        using_test_data = kwargs['test_data']

        if using_test_data:
            self.data_dir = get_test_download_directory()
            # publish the test directory through the app-wide setting too
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
        else:
            self.data_dir = get_download_directory()

        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.zip_metadata_path = os.path.join(self.data_dir, '.lastdownload')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # Test data has to be in place already, so fail right away if the
        # tsv directory is missing.
        if using_test_data:
            if not os.path.exists(self.tsv_dir):
                raise CommandError("Data tsv directory does not exist "
                                   "at %s" % self.tsv_dir)
            if self.verbosity:
                self.log("Using test data")

        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        # everything below only matters when a download was requested
        if not kwargs['download']:
            return

        self.download_metadata = self.get_download_metadata()
        self.local_metadata = self.get_local_metadata()

        remote_size = self.download_metadata['content-length']
        remote_modified = self.download_metadata['last-modified']
        previous_download = self.local_metadata['last-download']
        local_size = 0

        self.resume_download = (
            kwargs['resume-download'] and os.path.exists(self.zip_path)
        )

        if self.resume_download:
            # The partial zip is only usable when it was written after the
            # remote data last changed.
            zip_mtime = os.path.getmtime(self.zip_path)
            zip_datetime = datetime.fromtimestamp(zip_mtime, utc)
            self.resume_download = zip_datetime > remote_modified
            if self.resume_download:
                previous_download = zip_datetime
                local_size = os.path.getsize(self.zip_path)

        prompt_context = {
            'resuming': self.resume_download,
            'already_downloaded': remote_modified == previous_download,
            'last_modified': remote_modified,
            'last_download': previous_download,
            'time_ago': naturaltime(previous_download),
            'total_size': size(remote_size),
            'cur_size': size(local_size),
            'download_dir': self.data_dir,
        }

        self.prompt = render_to_string(
            'calaccess_raw/downloadcalaccessrawdata.txt',
            prompt_context,
        )
    def handle(self, *args, **options):
        """
        Make it happen.

        Takes the first version returned by
        ``RawDataVersion.objects.complete()``, gets or creates its
        ProcessedDataVersion, then truncates and reloads the table of every
        model returned by ``get_models_to_process()``, recording start and
        finish times and record counts along the way.

        Raises CommandError when there is no complete raw version.
        """
        super(Command, self).handle(*args, **options)

        # the first complete raw version is the one to process
        try:
            self.raw_version = RawDataVersion.objects.complete()[0]
        except IndexError:
            raise CommandError(
                'No raw CAL-ACCESS data loaded (run `python manage.py '
                'updatecalaccessrawdata`).'
            )

        # make sure the processed data directory is in place
        self.processed_data_dir = os.path.join(
            get_download_directory(), 'processed')
        if not os.path.exists(self.processed_data_dir):
            os.makedirs(self.processed_data_dir)

        # one ProcessedDataVersion per raw version; an existing one means
        # processing of this snapshot was started before
        version_lookup = ProcessedDataVersion.objects.get_or_create(
            raw_version=self.raw_version,
        )
        self.processed_version, is_new_version = version_lookup

        if is_new_version:
            header_template = 'Processing {:%m-%d-%Y %H:%M:%S} snapshot'
        else:
            header_template = (
                'Resuming processing of {:%m-%d-%Y %H:%M:%S} snapshot'
            )
        self.header(header_template.format(self.raw_version.release_datetime))

        # only stamp the start time the first time around
        if not self.processed_version.process_start_datetime:
            self.processed_version.process_start_datetime = now()
            self.processed_version.save()

        self.processed_models = get_models_to_process()

        for model in self.processed_models:
            table_name = model._meta.db_table

            # track this model's file record and mark it as started
            file_record, _ = ProcessedDataFile.objects.get_or_create(
                version=self.processed_version,
                file_name=model._meta.model_name,
            )
            file_record.process_start_datetime = now()
            file_record.save()

            # empty the table before reloading it
            if self.verbosity > 2:
                self.log(" Truncating %s" % table_name)
            with connection.cursor() as cursor:
                cursor.execute('TRUNCATE TABLE "%s" CASCADE' % (table_name))

            if self.verbosity > 2:
                self.log(" Loading raw data into %s" % table_name)
            model.objects.load_raw_data()

            # record the outcome for this model
            file_record.records_count = model.objects.count()
            file_record.process_finish_datetime = now()
            file_record.save()

        self.processed_version.process_finish_datetime = now()
        self.processed_version.save()

        self.success("Done!")
    def handle(self, *args, **options):
        """
        Convert one raw TSV file into a CSV file with every field quoted.

        Reads ``options['file_name']`` from the ``tsv/`` folder of the
        download directory and writes the lower-cased name, with "tsv"
        replaced by "csv", into the ``csv/`` folder. Rows whose field count
        does not match the header are left out of the CSV and handed to
        ``self.log_errors``. Unless ``options['keep_files']`` is set, the
        source TSV is deleted once the conversion is done.

        If the TSV has no header row the method returns early: the CSV is
        left empty and the TSV is kept.
        """
        super(Command, self).handle(*args, **options)

        # Set options
        self.file_name = options['file_name']
        self.data_dir = get_download_directory()
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        self.log_dir = os.path.join(self.data_dir, "log/")

        if self.verbosity > 2:
            self.log(" Cleaning %s" % self.file_name)

        # Up the CSV data limit
        csv.field_size_limit(1000000000)

        # Input and output paths
        tsv_path = os.path.join(self.tsv_dir, self.file_name)
        csv_path = os.path.join(
            self.csv_dir,
            self.file_name.lower().replace("tsv", "csv")
        )

        log_rows = []

        # Context managers close both files on every exit path, including
        # the early return below and any exception raised while parsing.
        with open(tsv_path, 'rb') as tsv_file, open(csv_path, 'w') as csv_file:
            csv_writer = CSVKitWriter(csv_file, quoting=csv.QUOTE_ALL)

            # Pull and clean the headers. readline() returns an empty value
            # at end of file rather than raising, so an empty file is
            # detected by the reader having no row to give.
            headers = tsv_file.readline().decode("ascii", "replace")
            headers_csv = CSVKitReader(StringIO(headers), delimiter=str('\t'))
            try:
                headers_list = next(headers_csv)
            except StopIteration:
                return
            headers_count = len(headers_list)
            csv_writer.writerow(headers_list)

            # Loop through the rest of the data. Numbering starts at 1 on
            # the first data row and advances for every row, rejected ones
            # included, so logged line numbers stay aligned with the file.
            for line_number, tsv_line in enumerate(tsv_file, 1):

                # Goofing around with the encoding while we're in there.
                tsv_line = tsv_line.decode("ascii", "replace")
                if six.PY2:
                    tsv_line = tsv_line.replace('\ufffd', '?')

                # Nuke any null bytes
                tsv_line = tsv_line.replace('\x00', ' ')

                # Nuke ASCII 26 char, the "substitute character"
                # or chr(26) in python
                tsv_line = tsv_line.replace('\x1a', '')

                # Split on tabs so we can later spit it back out as CSV
                # and remove extra newlines while we are there.
                csv_field_list = tsv_line.replace("\r\n", "").split("\t")

                # Check if our values line up with our headers
                # and if not, see if CSVkit can sort out the problems
                if len(csv_field_list) != headers_count:
                    csv_field_list = next(CSVKitReader(
                        StringIO(tsv_line),
                        delimiter=str('\t')
                    ))
                    if len(csv_field_list) != headers_count:
                        if self.verbosity > 2:
                            msg = '  Bad parse of line %s (%s headers, %s values)'
                            self.failure(msg % (
                                line_number,
                                headers_count,
                                len(csv_field_list)
                            ))
                        log_rows.append([
                            line_number,
                            headers_count,
                            len(csv_field_list),
                            ','.join(csv_field_list)
                        ])
                        continue

                # Write out the row
                csv_writer.writerow(csv_field_list)

        # Log errors if there are any. log_rows holds one entry per
        # rejected row and nothing else, so its length is the error count.
        if log_rows:
            if self.verbosity > 1:
                msg = '  %s errors'
                self.failure(msg % len(log_rows))
            self.log_errors(log_rows)

        # unless keeping files, remove tsv files
        if not options['keep_files']:
            os.remove(tsv_path)
    def handle(self, *args, **options):
        """
        Run the update: decide whether to resume or start over, then
        download, clean and load as requested by the options.

        Reads the ``app_name``, ``keep_files``, ``test_data``, ``download``,
        ``clean``, ``load`` and ``noinput`` options. With ``test_data`` the
        release datetime and size are read from ``sampled_version.txt`` in
        the test download directory and no confirmation prompt is shown.

        Raises CommandError when test data is requested but its tsv
        directory is missing, or when the user declines a confirmation
        prompt.
        """
        super(Command, self).handle(*args, **options)

        # set / compute any attributes that multiple class methods need
        self.app_name = options["app_name"]
        self.keep_files = options["keep_files"]
        self.test_mode = options['test_data']
        self.downloading = options['download']
        self.cleaning = options['clean']
        self.loading = options['load']
        self.noinput = options['noinput']

        if self.test_mode:
            # and always keep files when running test data
            self.keep_files = True
            self.data_dir = get_test_download_directory()
            # need to set this app-wide because cleancalaccessrawfile
            #   also calls get_download_directory
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir
            # never prompt when running against test data
            self.noinput = True
        else:
            self.data_dir = get_download_directory()

        # create the data directory if it is not there yet
        os.path.exists(self.data_dir) or os.makedirs(self.data_dir)
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # Immediately check that the tsv directory exists when using test data,
        #   so we can stop immediately.
        if self.test_mode:
            if not os.path.exists(self.tsv_dir):
                raise CommandError("Data tsv directory does not exist "
                                   "at %s" % self.tsv_dir)
            elif self.verbosity:
                self.log("Using test data")

        # create the csv output directory if it is not there yet
        self.csv_dir = os.path.join(self.data_dir, "csv/")
        os.path.exists(self.csv_dir) or os.makedirs(self.csv_dir)

        if self.test_mode:
            # first line holds the release datetime, second the size
            # NOTE(review): readline() returns text that keeps any trailing
            # newline; if version.release_datetime is a datetime, the
            # equality check further down can never match in test mode —
            # confirm whether these values should be stripped/parsed
            with open(self.data_dir + "/sampled_version.txt", "r") as f:
                current_release_datetime = f.readline()
                expected_size = f.readline()
        else:
            download_metadata = self.get_download_metadata()
            current_release_datetime = download_metadata['last-modified']
            expected_size = download_metadata['content-length']

        last_started_update = self.get_last_log()

        if self.test_mode:
            last_download = None
        else:
            # newest log of the download command, or None when there is none
            try:
                last_download = self.command_logs.filter(
                    command='downloadcalaccessrawdata'
                ).order_by('-start_datetime')[0]
            except IndexError:
                last_download = None

        up_to_date = False
        can_resume = False

        # if there's a previously started update
        if last_started_update:
            # if current release datetime matches version of last started update
            if current_release_datetime == last_started_update.version.release_datetime:
                # if the last update finished
                if last_started_update.finish_datetime:
                    up_to_date = True
                else:
                    # if the last update didn't finish
                    # (but is still for the current version)
                    can_resume = True
            # if the last started update didn't finish
            elif not last_started_update.finish_datetime:
                # can resume update of old version as long as skipping download
                if not self.downloading:
                    can_resume = True
                # or if there is a last download
                elif last_download:
                    # and last download's version matches the outstanding update version
                    if last_download.version == last_started_update.version:
                        # and last download completed
                        if last_download.finish_datetime:
                            can_resume = True

        if self.noinput:
            # if not taking input and can resume, automatically go into resume mode
            self.resume_mode = can_resume
        else:
            # values handed to the confirmation prompt template
            prompt_context = dict(
                current_release_datetime=current_release_datetime,
                expected_size=size(expected_size),
                up_to_date=up_to_date,
                can_resume=can_resume,
            )

            last_finished_update = self.get_last_log(finished=True)

            if last_finished_update:
                loaded_v = last_finished_update.version
                prompt_context['since_loaded_version'] = naturaltime(loaded_v.release_datetime)
            else:
                prompt_context['since_loaded_version'] = None

            prompt = render_to_string(
                'calaccess_raw/updatecalaccessrawdata.txt',
                prompt_context,
            )

            if can_resume:
                # first answer picks resume; declining leads to a second
                # question about starting over, and declining that aborts
                if self.confirm_proceed(prompt):
                    self.resume_mode = True
                else:
                    self.resume_mode = False
                    if not self.confirm_proceed('Do you want re-start your update?\n'):
                        raise CommandError("Update cancelled")
            else:
                self.resume_mode = False
                if not self.confirm_proceed(prompt):
                    raise CommandError("Update cancelled")

        if self.resume_mode:
            # keep writing to the log of the unfinished update
            self.log_record = last_started_update
        else:
            # get or create a version
            # .get_or_create() throws IntegrityError
            try:
                version = self.raw_data_versions.get(
                    release_datetime=current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                version = self.raw_data_versions.create(
                    release_datetime=current_release_datetime,
                    size=expected_size
                )
            # create a new log record
            self.log_record = self.command_logs.create(
                version=version,
                command=self,
                called_by=self.get_caller_log()
            )

        # if the user could have resumed but didn't
        force_restart_download = can_resume and not self.resume_mode

        # if not skipping download, and there's a previous download
        # (last_download is always None in test mode, so this is skipped)
        if self.downloading and last_download:
            # if not forcing a restart
            if not force_restart_download:
                # check if version we are updating is last one being downloaded
                if self.log_record.version == last_download.version:
                    # if it finished
                    if last_download.finish_datetime:
                        self.log('Already downloaded.')
                        self.downloading = False

        if self.downloading:
            if self.test_mode:
                # test runs use the separate test download command
                call_command(
                    "downloadcalaccessrawdatatest",
                    verbosity=self.verbosity,
                )
            else:
                # the download command is run non-interactively; any
                # prompting already happened above
                call_command(
                    "downloadcalaccessrawdata",
                    keep_files=self.keep_files,
                    verbosity=self.verbosity,
                    noinput=True,
                    restart=force_restart_download,
                )
            if self.verbosity:
                self.duration()

        # execute the other steps that haven't been skipped
        if options['clean']:
            self.clean()
            if self.verbosity:
                self.duration()

        if options['load']:
            self.load()
            if self.verbosity:
                self.duration()

        if self.verbosity:
            self.success("Done!")

        # stamp the log record as finished
        self.log_record.finish_datetime = now()
        self.log_record.save()
    def handle(self, *args, **options):
        """
        Prepare a download run, confirm it with the user when required,
        then run the download, unzip, prep and track_files steps.

        Reads the ``restart``, ``noinput`` and ``keep_files`` options. A
        run that can be resumed reuses the last started log record;
        otherwise a version record is looked up or created and a new log
        record is opened. The log record is stamped as finished at the end.

        Raises CommandError when the user declines the prompt.
        """
        super(Command, self).handle(*args, **options)

        # the download directory comes from app settings; create it if needed
        self.data_dir = get_download_directory()
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        # the zip is named after the last segment of the URL,
        # and raw tsv files go next to it in tsv/
        self.zip_path = os.path.join(self.data_dir, self.url.split('/')[-1])
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        remote_metadata = self.get_download_metadata()
        self.current_release_datetime = remote_metadata['last-modified']
        self.current_release_size = remote_metadata['content-length']

        self.last_started_download = self.get_last_log()
        self.last_finished_download = self.get_last_log(finished=True)

        # release of the last finished download, when there is one
        previous_release = None
        since_prev_version = None
        if self.last_finished_download:
            previous_release = self.last_finished_download.version.release_datetime
            since_prev_version = naturaltime(previous_release)

        already_downloaded = bool(
            previous_release == self.current_release_datetime
        )

        # can resume only if possible and not forcing re-start
        self.resume_download = self.check_can_resume() and not options['restart']

        if self.resume_download:
            # size and modification time of the partially downloaded zip
            self.local_file_size = os.path.getsize(self.zip_path)
            self.local_file_datetime = datetime.fromtimestamp(
                os.path.getmtime(self.zip_path),
                utc,
            )
        else:
            self.local_file_size = 0
            self.local_file_datetime = None

        # ask for confirmation unless input is disabled or a restart is forced
        if not (options['noinput'] or options['restart']):
            prompt_context = {
                'current_release_datetime': self.current_release_datetime,
                'resuming': self.resume_download,
                'already_downloaded': already_downloaded,
                'expected_size': size(self.current_release_size),
                'local_file_size': size(self.local_file_size),
                'download_dir': self.data_dir,
                'since_prev_version': since_prev_version,
                'since_local_file_modified': naturaltime(
                    self.local_file_datetime
                ),
            }
            prompt = render_to_string(
                'calaccess_raw/downloadcalaccessrawdata.txt',
                prompt_context,
            )
            if not self.confirm_proceed(prompt):
                raise CommandError("Download cancelled")

        if self.resume_download:
            # carry on with the log record of the interrupted download
            self.log_record = self.last_started_download
            self.version = self.log_record.version
        else:
            # look the version up first and create it only when missing
            # (.get_or_create() throws IntegrityError)
            try:
                self.version = self.raw_data_versions.get(
                    release_datetime=self.current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                self.version = self.raw_data_versions.create(
                    release_datetime=self.current_release_datetime,
                    size=remote_metadata['content-length']
                )
            self.log_record = self.command_logs.create(
                version=self.version,
                command=self,
                called_by=self.get_caller_log()
            )

        # run the pipeline steps in order
        self.download()
        self.unzip()
        self.prep()
        self.track_files()

        # archiving is opt-in through the CALACCESS_STORE_ARCHIVE setting
        if getattr(settings, 'CALACCESS_STORE_ARCHIVE', False):
            self.archive()

        # unless keeping files, drop the zip and the CalAccess folder
        if not options['keep_files']:
            os.remove(self.zip_path)
            shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))

        self.log_record.finish_datetime = now()
        self.log_record.save()
    def handle(self, *args, **options):
        """
        Prepare a download run, confirm it with the user when required,
        then run the download, unzip and prep steps.

        Reads the ``restart``, ``noinput`` and ``keep_files`` options. A
        run that can be resumed reuses the last started log record;
        otherwise a version record is looked up or created and a new log
        record is opened. The log record is stamped as finished at the end.

        Raises CommandError when the user declines the prompt.
        """
        super(Command, self).handle(*args, **options)

        keep_files = options['keep_files']

        # the download directory comes from app settings; create it if needed
        self.data_dir = get_download_directory()
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

        # the zip and the raw tsv folder both live in the data directory
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        remote_metadata = self.get_download_metadata()
        self.current_release_datetime = remote_metadata['last-modified']
        self.current_release_size = remote_metadata['content-length']

        self.last_started_download = self.get_last_log()
        self.last_finished_download = self.get_last_log(finished=True)

        # release of the last finished download, when there is one
        previous_release = None
        since_prev_version = None
        if self.last_finished_download:
            previous_release = self.last_finished_download.version.release_datetime
            since_prev_version = naturaltime(previous_release)

        already_downloaded = bool(
            previous_release == self.current_release_datetime
        )

        # can resume only if possible and not forcing re-start
        self.resume_download = self.check_can_resume() and not options['restart']

        if self.resume_download:
            # size and modification time of the partially downloaded zip
            self.local_file_size = os.path.getsize(self.zip_path)
            self.local_file_datetime = datetime.fromtimestamp(
                os.path.getmtime(self.zip_path),
                utc,
            )
        else:
            self.local_file_size = 0
            self.local_file_datetime = None

        # ask for confirmation unless input is disabled or a restart is forced
        if not (options['noinput'] or options['restart']):
            prompt_context = {
                'current_release_datetime': self.current_release_datetime,
                'resuming': self.resume_download,
                'already_downloaded': already_downloaded,
                'expected_size': size(self.current_release_size),
                'local_file_size': size(self.local_file_size),
                'download_dir': self.data_dir,
                'since_prev_version': since_prev_version,
                'since_local_file_modified': naturaltime(
                    self.local_file_datetime
                ),
            }
            prompt = render_to_string(
                'calaccess_raw/downloadcalaccessrawdata.txt',
                prompt_context,
            )
            if not self.confirm_proceed(prompt):
                raise CommandError("Download cancelled")

        if self.resume_download:
            # carry on with the log record of the interrupted download
            self.log_record = self.last_started_download
        else:
            # look the version up first and create it only when missing
            # (.get_or_create() throws IntegrityError)
            try:
                release_version = self.raw_data_versions.get(
                    release_datetime=self.current_release_datetime
                )
            except RawDataVersion.DoesNotExist:
                release_version = self.raw_data_versions.create(
                    release_datetime=self.current_release_datetime,
                    size=remote_metadata['content-length']
                )
            self.log_record = self.command_logs.create(
                version=release_version,
                command=self,
                called_by=self.get_caller_log()
            )

        self.download()
        self.unzip()

        # the zip is no longer needed once it has been unpacked
        if not keep_files:
            os.remove(self.zip_path)

        self.prep()

        # the CalAccess folder goes after prep has run
        if not keep_files:
            shutil.rmtree(os.path.join(self.data_dir, 'CalAccess'))

        self.log_record.finish_datetime = datetime.now()
        self.log_record.save()
Exemplo n.º 40
0
    def set_options(self, *args, **kwargs):
        """
        Store the command's options and derived paths on the instance.

        Reads the 'verbosity', 'test_data' and 'download' options, plus
        'resume-download' when 'download' is truthy. Creates the data and
        csv directories if they are missing. When 'download' is truthy,
        also records the remote and local metadata, decides whether a
        partially downloaded zip can be resumed, and renders the
        confirmation prompt into self.prompt.

        Raises CommandError if 'test_data' is truthy and the tsv
        directory does not exist.
        """
        self.url = 'http://campaignfinance.cdn.sos.ca.gov/dbwebexport.zip'
        self.verbosity = int(kwargs['verbosity'])
        using_test_data = kwargs['test_data']

        # Pick the working directory; test runs also repoint the setting
        if not using_test_data:
            self.data_dir = get_download_directory()
        else:
            self.data_dir = get_test_download_directory()
            settings.CALACCESS_DOWNLOAD_DIR = self.data_dir

        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)
        self.zip_path = os.path.join(self.data_dir, 'calaccess.zip')
        self.tsv_dir = os.path.join(self.data_dir, "tsv/")

        # With test data the tsv directory must already be there, so fail
        # right away when it is not.
        if using_test_data:
            if not os.path.exists(self.tsv_dir):
                raise CommandError("Data tsv directory does not exist "
                                   "at %s" % self.tsv_dir)
            if self.verbosity:
                self.log("Using test data")

        self.csv_dir = os.path.join(self.data_dir, "csv/")
        if not os.path.exists(self.csv_dir):
            os.makedirs(self.csv_dir)

        if kwargs['download']:
            self.download_metadata = self.get_download_metadata()
            self.local_metadata = self.get_local_metadata()

            remote_size = self.download_metadata['content-length']
            remote_modified = self.download_metadata['last-modified']
            downloaded_at = self.local_metadata['last-download']
            partial_size = 0

            # Resuming needs both the option and an existing zip file
            self.resume_download = (
                kwargs['resume-download'] and os.path.exists(self.zip_path)
            )

            if self.resume_download:
                # Only resume when the partial file was modified after the
                # remote data was last updated.
                partial_datetime = datetime.fromtimestamp(
                    os.path.getmtime(self.zip_path),
                    utc,
                )
                self.resume_download = partial_datetime > remote_modified
                if self.resume_download:
                    downloaded_at = partial_datetime
                    partial_size = os.path.getsize(self.zip_path)

            # Values handed to the confirmation prompt template
            prompt_values = {
                'resuming': self.resume_download,
                'already_downloaded': remote_modified == downloaded_at,
                'last_modified': remote_modified,
                'last_download': downloaded_at,
                'time_ago': naturaltime(downloaded_at),
                'total_size': size(remote_size),
                'cur_size': size(partial_size),
                'download_dir': self.data_dir,
            }

            self.prompt = render_to_string(
                'calaccess_raw/downloadcalaccessrawdata.txt',
                prompt_values,
            )