def get_column_types(self):
    """Build the ``agate.TypeTester`` used to infer column types.

    When ``self.args.blanks`` is truthy every candidate type is created with
    ``null_values=()``. When ``self.args.no_inference`` is set, ``Text`` is the
    only candidate type.
    """
    common = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text = agate.Text(**common)
    if self.args.no_inference:
        return agate.TypeTester(types=[text])

    number = agate.Number(locale=self.args.locale, **common)

    # See the order in the `agate.TypeTester` class.
    candidates = [
        agate.Boolean(**common),
        agate.TimeDelta(**common),
        agate.Date(date_format=self.args.date_format, **common),
        agate.DateTime(datetime_format=self.args.datetime_format, **common),
        text,
    ]

    # With an explicit date/datetime format, Number is tried just before Text
    # (after the date types) in order to parse dates like "20010101";
    # otherwise it goes right after Boolean.
    explicit_format = self.args.date_format or self.args.datetime_format
    candidates.insert(-1 if explicit_format else 1, number)

    return agate.TypeTester(types=candidates)
def get_column_types(self):
    """Return the ``agate.TypeTester`` matching the command-line options.

    ``Text`` is created with ``cast_nulls=False`` when ``self.args.blanks`` is
    truthy. With ``self.args.no_inference`` set, ``Text`` is the only
    candidate type.
    """
    keep_blanks = getattr(self.args, 'blanks', None)
    text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

    candidates = [text_type]
    if not self.args.no_inference:
        candidates = [
            agate.Boolean(),
            agate.Number(locale=self.args.locale),
            agate.TimeDelta(),
            agate.Date(date_format=self.args.date_format),
            agate.DateTime(datetime_format=self.args.datetime_format),
            text_type,
        ]

    return agate.TypeTester(types=candidates)
def load_data(data):
    """Load ``DPI2015_basefile.v5.csv`` into ``data['dpi']``.

    Type inference is restricted to the ``boolean``, ``number`` and ``text``
    type objects defined outside this function.
    """
    candidate_types = [boolean, number, text]
    data['dpi'] = agate.Table.from_csv(
        'DPI2015_basefile.v5.csv',
        column_types=agate.TypeTester(types=candidate_types),
    )
def test_load(self):
    """Load the exonerations CSV with three forced column types and print it."""
    forced_types = {
        'last_name': agate.Text(),
        'first_name': agate.Text(),
        'age': agate.Number(),
    }
    tester = agate.TypeTester(force=forced_types)

    exonerations = agate.Table.from_csv(
        '../../../data/exonerations-20150828.csv',
        column_types=tester,
    )

    print(exonerations)  # description of the table
def make_type_tester(meta):
    """
    Build a :class:`.agate.TypeTester` from parsed lookup table metadata so
    that every table column is forced to its declared type (which also avoids
    the overhead of type inference).

    Each entry of ``meta['columns']`` maps a column name to a dict whose
    ``'type'`` value names an attribute of the ``agate`` module; that
    attribute is called with no arguments to create the forced type.
    """
    forced = {
        column: getattr(agate, spec['type'])()
        for column, spec in meta['columns'].items()
    }
    return agate.TypeTester(force=forced)
def get_column_types(self):
    """Build the ``agate.TypeTester`` for this utility's options.

    Every type receives ``null_values=()`` when ``self.args.blanks`` is
    truthy. With ``self.args.no_inference`` set, only ``Text`` is offered.
    """
    null_opts = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_only = [agate.Text(**null_opts)]
    if self.args.no_inference:
        return agate.TypeTester(types=text_only)

    inferred = [
        agate.Boolean(**null_opts),
        agate.Number(locale=self.args.locale, **null_opts),
        agate.TimeDelta(**null_opts),
        agate.Date(date_format=self.args.date_format, **null_opts),
        agate.DateTime(datetime_format=self.args.datetime_format, **null_opts),
    ]
    # Text always comes last in the candidate list.
    inferred.extend(text_only)

    return agate.TypeTester(types=inferred)
def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
    """Create a ``TypeTester`` with fixed candidate types and forced text columns.

    All types treat ``'null'`` and ``''`` as null. Booleans only accept
    ``'true'`` / ``'false'``. Every column named in ``text_columns`` is forced
    to ``Text``.
    """
    null_markers = ('null', '')
    data_types = agate.data_types

    candidates = [
        data_types.Number(null_values=null_markers),
        data_types.Date(null_values=null_markers, date_format='%Y-%m-%d'),
        data_types.DateTime(null_values=null_markers, datetime_format='%Y-%m-%d %H:%M:%S'),
        ISODateTime(null_values=null_markers),
        data_types.Boolean(true_values=('true',), false_values=('false',), null_values=null_markers),
        data_types.Text(null_values=null_markers),
    ]

    # One Text instance per forced column, as in a plain per-key construction.
    forced = {}
    for column in text_columns:
        forced[column] = data_types.Text(null_values=null_markers)

    return agate.TypeTester(force=forced, types=candidates)
def get_column_types(self):
    """Build the ``agate.TypeTester`` for this utility's options.

    Every type receives ``null_values=()`` when ``self.args.blanks`` is
    truthy. With ``self.args.no_inference`` set, only ``Text`` is offered.
    """
    null_opts = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_only = [agate.Text(**null_opts)]
    if self.args.no_inference:
        return agate.TypeTester(types=text_only)

    inferred = [
        agate.Boolean(**null_opts),
        agate.TimeDelta(**null_opts),
        agate.Date(date_format=self.args.date_format, **null_opts),
        agate.DateTime(datetime_format=self.args.datetime_format, **null_opts),
        # This is a different order than agate's default, in order to parse
        # dates like "20010101".
        agate.Number(locale=self.args.locale, **null_opts),
    ]
    inferred.extend(text_only)

    return agate.TypeTester(types=inferred)
def extract_xlsx_to_sql(filename, sqlurl):
    """Copy every sheet of an xlsx workbook into its own SQL table.

    Args:
        filename: Path of the xlsx file to read.
        sqlurl: Connection string/URL handed to ``Table.to_sql``.

    Each sheet is loaded with all columns forced to ``Text`` and written to a
    table named after the lowercased sheet name. Progress and elapsed seconds
    are printed per sheet.
    """
    workbook = openpyxl.load_workbook(filename, read_only=True, data_only=True)
    try:
        sheetnames = workbook.sheetnames
    finally:
        # A read-only workbook keeps its file handle open until it is closed
        # explicitly; only the sheet names are needed from it.
        workbook.close()

    # Agate makes an educated guess at data types, but sometimes it
    # guesses wrong. This is especially possible when dealing with
    # multiple data files like we are in this project. We're able
    # to override the type inference system here
    #
    # Many columns are sparsely populated: mostly blank, with maybe
    # a 1 or a 0 once in a while. Agate will assume those are boolean
    # columns, but once in a while there'll be a number.
    #
    # Since the data is spread across so many input files, we are ignoring
    # the type testing capabilities all together and forcing everything
    # to Text()
    type_tester = agate.TypeTester(types=[agate.Text()])

    for sheetname in sheetnames:
        print("%s :: %s" % (filename, sheetname))
        start = datetime.now()

        t = agate.Table.from_xlsx(filename, sheet=sheetname, column_types=type_tester)

        # we create a duplicate table with lowercase column names because
        # uppercase names require quoting in resulting SQL statements
        # and we lowercase the SQL table name for the same reason in the
        # following line
        t = t.rename(column_names=[name.lower() for name in t.column_names])
        t.to_sql(
            sqlurl,
            sheetname.lower(),
            constraints=False,
            create=True,
            create_if_not_exists=True,
            # chunk_size=1,
        )

        delta = datetime.now() - start
        # total_seconds() covers the whole duration; .seconds alone drops
        # any full days from the elapsed time.
        print("%s :: %s :: %d elapsed" % (filename, sheetname, delta.total_seconds()))
def get_column_types(self):
    """Return a ``TypeTester(limit=0)`` when inference is disabled, else ``None``."""
    if not self.args.no_inference:
        return None
    return agate.TypeTester(limit=0)
import agate

# Candidate types offered to agate's type inference. Every type is configured
# with null_values=('null', ''), and Boolean only accepts 'true' / 'false'.
DEFAULT_TYPE_TESTER = agate.TypeTester(types=[
    agate.data_types.Number(null_values=('null', '')),
    agate.data_types.TimeDelta(null_values=('null', '')),
    agate.data_types.Date(null_values=('null', '')),
    agate.data_types.DateTime(null_values=('null', '')),
    agate.data_types.Boolean(true_values=('true', ), false_values=('false', ), null_values=('null', '')),
    agate.data_types.Text(null_values=('null', ''))
])


def table_from_data(data, column_names):
    "Convert list of dictionaries into an Agate table"

    # The agate table is generated from a list of dicts, so the column order
    # from `data` is not preserved. We can use `select` to reorder the columns
    #
    # If there is no data, create an empty table with the specified columns
    if len(data) == 0:
        return agate.Table([], column_names=column_names)
    else:
        # Types are inferred with DEFAULT_TYPE_TESTER, then the columns are
        # put into the order given by `column_names`.
        table = agate.Table.from_object(data, column_types=DEFAULT_TYPE_TESTER)
        return table.select(column_names)


def empty_table():
    "Returns an empty Agate table. To be used in place of None"
#!/usr/bin/env python

import agate

# Force the 'fips' column to Text instead of letting agate infer its type.
tester = agate.TypeTester(force={'fips': agate.Text()})

table = agate.Table.from_csv(
    'examples/realdata/ks_1033_data.csv',
    column_types=tester,
)

# Question 1: What was the total cost to Kansas City area counties?

# Filter to counties containing Kansas City
kansas_city_counties = ('JACKSON', 'CLAY', 'CASS', 'PLATTE')
kansas_city = table.where(lambda r: r['county'] in kansas_city_counties)

# Sum total_cost of four counties
kansas_city_total = kansas_city.columns['total_cost'].aggregate(agate.Sum())
print('Total for Kansas City area: %i' % kansas_city_total)

# Question 2: Which counties spent the most?

# Group by counties
counties = table.group_by('county')

# Aggregate totals for all counties, largest total first
totals = counties.aggregate([
    ('total_cost', agate.Sum(), 'total_cost_sum'),
]).order_by('total_cost_sum', reverse=True)

totals.limit(20).print_bars('county', 'total_cost_sum', width=80)

print('Five most spendy counties:')
import agate
import agatecharts

agatecharts.patch()

OUTPUT_DIR = 'docs/samples'

# NOTE(review): `os` is used below but no `import os` is visible in this
# snippet — confirm it is imported elsewhere.

# Create the output directory if needed, then remove every entry in it.
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

for filename in os.listdir(OUTPUT_DIR):
    os.remove(os.path.join(OUTPUT_DIR, filename))

# Force the ' Date' column (the name has a leading space) to be parsed as a
# date in %Y-%m-%d format.
tester = agate.TypeTester(force={
    ' Date': agate.Date('%Y-%m-%d')
})

# NOTE(review): `tester` is passed positionally — confirm it binds to
# `column_types` in the agate version in use.
emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester)

# Derived columns: day of month from ' Date', and the three emission columns
# with falsy values (e.g. None) replaced by 0.
emissions = emissions.compute([
    (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'),
    (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'),
    (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'),
    (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2')
])

states = emissions.group_by('State')
state_totals = states.aggregate([
    ('so2', agate.Sum(), 'so2'),
    ('co2', agate.Sum(), 'co2'),
import agate, os, itertools, time, datetime, glob, csv
from datetime import date

text_type = agate.Text()
datetime_type = agate.DateTime()

# Force 'contb_receipt_dt' to Text instead of letting agate infer its type.
tester = agate.TypeTester(force={'contb_receipt_dt': agate.Text()})

today = date.today()
# Year, month and day are concatenated without zero padding.
datestamp = str(today.year) + str(today.month) + str(today.day)

# Use the first file matching the glob pattern; raises IndexError if nothing
# matches.
ky_candidates_file = str(
    glob.glob('data/csv/process/*ky-candidate-contributions.csv')[0])

ky_candidate_contributions = agate.Table.from_csv(ky_candidates_file, column_types=tester)

current_candidate_cmte_ids = ['C00580100', 'C00575795']
#Trump, Donald J. = C00580100
#Sanders, Bernard = C00577130
#Kasich, John R. = C00581876
#Clinton, Hillary Rodham = C00575795
#Cruz, Rafael Edward 'Ted' = C00574624


def candidate_brackets(contributions):
    #brackets
    #bracket1 = 200 and under
    #bracket2 = 200.01 - 499.99
    #bracket3 = 500 - 999.99
import agate

# Candidate types offered to agate's type inference, each with its default
# settings.
DEFAULT_TYPE_TESTER = agate.TypeTester(types=[
    agate.data_types.Number(),
    agate.data_types.Date(),
    agate.data_types.DateTime(),
    agate.data_types.Boolean(),
    agate.data_types.Text()
])


def table_from_data(data):
    "Convert list of dictionaries into an Agate table"
    # Column types are inferred using DEFAULT_TYPE_TESTER.
    return agate.Table.from_object(data, column_types=DEFAULT_TYPE_TESTER)


def empty_table():
    "Returns an empty Agate table. To be used in place of None"

    return agate.Table(rows=[])


def as_matrix(table):
    "Return an agate table as a matrix of data sans columns"

    # One `values()` result per row, in row order.
    return [r.values() for r in table.rows.values()]


def from_csv(abspath):
def main(self):
    """Convert the input file to CSV and write it to ``self.output_file``.

    The format is taken from ``--format`` when given, otherwise implied by a
    schema (``fixed``) or a key (``json``), otherwise guessed from the input
    path. The input file and the optional schema file are closed before
    returning, including when the conversion raises.

    Raises:
        ValueError: if the format is ``fixed`` but no schema was given, or if
            a DBF conversion is requested for input that has no ``name``.
    """
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )

    self.buffers_input = filetype == 'csv' or not self.args.no_inference

    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Opened below only when a schema path was given; tracked here so the
    # cleanup in `finally` knows whether there is anything to close.
    schema = None

    try:
        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            # Stream rows straight through without building a table.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                # NOTE(review): only `sheet` is forwarded for xls/xlsx, so the
                # column_types set above for --no-inference is not applied
                # here — confirm whether that is intended.
                table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet'))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet'))
            elif filetype == 'dbf':
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)

            table.to_csv(self.output_file)
    finally:
        # Release the files even when the conversion fails part-way.
        self.input_file.close()

        if schema is not None:
            schema.close()
def DumbTypeTester():
    """Return a ``TypeTester`` whose types treat only ``''`` as null.

    The candidate types are those of a default ``agate.TypeTester``, each
    re-created from its class with ``null_values=('',)``.
    """
    # We get agate's normal possible types
    default_types = agate.TypeTester()._possible_types
    rebuilt = [kind.__class__(null_values=('',)) for kind in default_types]
    return agate.TypeTester(types=rebuilt)
import agate

# Force explicit types for the listed columns instead of letting agate infer
# them: the first six as Text, the last four as Number.
tester = agate.TypeTester(
    force={
        'Physician_First_Name': agate.Text(),
        'Physician_Last_Name': agate.Text(),
        'Recipient_Primary_Business_Street_Address_Line1': agate.Text(),
        'Recipient_City': agate.Text(),
        'Recipient_Zip_Code': agate.Text(),
        'Physician_Specialty': agate.Text(),
        'Physician_Profile_ID': agate.Number(),
        'Total_Amount_of_Payment_USDollars': agate.Number(),
        'General': agate.Number(),
        'Research': agate.Number()
    })

# Short replacement names for the same ten columns.
column_renames = {
    'Physician_First_Name': 'fn',
    'Physician_Last_Name': 'ln',
    'Recipient_Primary_Business_Street_Address_Line1': 'add',
    'Recipient_City': 'city',
    'Recipient_Zip_Code': 'zip',
    'Physician_Specialty': 'spec',
    'Physician_Profile_ID': 'id',
    'Total_Amount_of_Payment_USDollars': 'd',
    'General': 'g',
    'Research': 'r'
}

table = agate.Table.from_csv(
    'edits/payments/most-paid-02-trim.csv',