def subset(data):
    subset = data['table'].where(
        lambda r: r['origin'] in SELECTED_COUNTRIES and r['year'] >= 1980)

    groups = subset.group_by(
        lambda r: '/'.join([str(r['year']), r['origin']]),
        key_name='year_and_origin')

    refugees = groups.aggregate([
        ('refugees', agate.Sum('refugees')),
        ('asylum_seekers', agate.Sum('asylum_seekers')),
        ('returned_refugees', agate.Sum('returned_refugees')),
        ('idps', agate.Sum('idps')),
        ('returned_idps', agate.Sum('returned_idps')),
        ('stateless_persons', agate.Sum('stateless_persons')),
        ('others', agate.Sum('others')),
        ('total', agate.Sum('total'))
    ]).order_by('year_and_origin', reverse=True)

    refugees = refugees.compute([
        ('year', agate.Formula(agate.Text(), lambda r: r['year_and_origin'].split('/')[0])),
        ('origin', agate.Formula(agate.Text(), lambda r: r['year_and_origin'].split('/')[1]))
    ])

    refugees = refugees.select([
        'origin', 'year', 'refugees', 'asylum_seekers', 'idps',
        'returned_idps', 'stateless_persons', 'others', 'total'
    ])

    refugees.to_csv('subset.csv')

    refugees.pivot('year', 'origin', agate.Sum('total')).order_by('year').to_csv('subset_pivot.csv')
def test_load(self):
    tester = agate.TypeTester(force={
        'last_name': agate.Text(),
        'first_name': agate.Text(),
        'age': agate.Number()
    })

    exonerations = agate.Table.from_csv(
        '../../../data/exonerations-20150828.csv', column_types=tester)

    print(exonerations)  # Print a description of the table
def get_types(example_row):
    types = []

    for v in example_row:
        value_type = xlrd.sheet.ctype_text[v.ctype]

        if value_type == 'text':
            types.append(agate.Text())
        elif value_type == 'number':
            types.append(agate.Number())
        elif value_type == 'xldate':
            types.append(agate.Date())
        else:
            types.append(agate.Text())

    return types
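# A minimal usage sketch for get_types() above (not from the original
# source): 'workbook.xls' is a hypothetical file whose first row is a header
# and whose second row is representative of the column types.
import agate
import xlrd

book = xlrd.open_workbook('workbook.xls')  # hypothetical path
sheet = book.sheet_by_index(0)

column_names = [cell.value for cell in sheet.row(0)]
column_types = get_types(sheet.row(1))  # infer agate types from one data row

# Note: xldate cells come back as floats; real code would convert them with
# xlrd.xldate_as_datetime(value, book.datemode) before casting to agate.Date.
rows = [[cell.value for cell in sheet.row(i)] for i in range(1, sheet.nrows)]
table = agate.Table(rows, column_names, column_types)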
def test_distinct_values(self):
    column_names: List = [
        'id', 'name', 'dob', 'last seen', 'size', 'active',
    ]
    column_types: List = [
        agate.Number(), agate.Text(), agate.Date(),
        agate.DateTime(), agate.Text(), agate.Boolean(),
    ]
    rows = [
        (1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
        (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
        (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
        (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
        (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
        (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
        (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
        (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
        (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
        (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True),
    ]

    model = csvhound.core.BaseHound()
    table = model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size',))
    self.assertColumnTypes(distinct, [type(c) for c in distinct.column_types])
    self.assertRows(distinct, distinct_agate)
def test_create_if_not_exists(self):
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]

    rows1 = (
        (1, 'Jake'),
        (2, 'Howard'),
    )
    rows2 = (
        (3, 'Liz'),
        (4, 'Tim'),
    )

    table1 = agate.Table(rows1, column_names, column_types)
    table2 = agate.Table(rows2, column_names, column_types)

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    # Write two agate tables into the same SQL table
    table1.to_sql(connection, 'create_if_not_exists_test',
                  create=True, create_if_not_exists=True, insert=True)
    table2.to_sql(connection, 'create_if_not_exists_test',
                  create=True, create_if_not_exists=True, insert=True)
def setUp(self):
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )

    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]

    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(),
        agate.Date(), agate.DateTime(),
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)

    self.connection_string = 'sqlite:///:memory:'
def load_data(data):
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip header
        next(f)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
def load_year_killed_data(year):
    specified_types = {
        'killed': agate.Number(),
        'injured': agate.Number(),
        'date_hour': agate.Text()
    }

    return agate.Table.from_url(
        'https://s3.amazonaws.com/traffic-sd/accidents_killed_{}.csv'.format(year),
        column_types=specified_types)
def setUp(self):
    text_type = agate.Text()
    number_type = agate.Number()

    columns = (
        ('gender', text_type),
        ('month', number_type),
        ('median', number_type),
        ('stdev', number_type),
        ('1st', number_type),
        ('3rd', number_type),
        ('5th', number_type),
        ('15th', number_type),
        ('25th', number_type),
        ('50th', number_type),
        ('75th', number_type),
        ('85th', number_type),
        ('95th', number_type),
        ('97th', number_type),
        ('99th', number_type)
    )

    with open('examples/heights.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip header
        next(f)

        # Create the table
        self.table = agate.Table(reader, columns)

    if os.path.exists(TEST_FILENAME):
        os.remove(TEST_FILENAME)
def add_full_hour_date(data):
    data['table'] = data['table'].compute([
        ('date_hour', agate.Formula(
            agate.Text(),
            lambda r: r['date_time'].strftime("%Y-%m-%d %H:00:00"))),
    ])
def load_data(data):
    """
    Load the dataset.
    """
    text_type = agate.Text()
    number_type = agate.Number()

    columns = OrderedDict([
        ('year', number_type),
        ('residence', text_type),
        ('origin', text_type),
        ('refugees', number_type),
        ('asylum_seekers', number_type),
        ('returned_refugees', number_type),
        ('idps', number_type),
        ('returned_idps', number_type),
        ('stateless_persons', number_type),
        ('others', number_type),
        ('total', number_type),
    ])

    # Load the data
    with open('unhcr_popstats_export_persons_of_concern_2016_01_12_192533.csv') as f:
        reader = csvkit.reader(f)
        next(reader)

        rows = []

        for row in reader:
            rows.append([None if d == '*' else d for d in row])

    data['table'] = agate.Table(rows, columns.keys(), columns.values())
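# A hedged driver sketch (not in the original source) tying load_data() to
# the subset() pipeline at the top of this file; SELECTED_COUNTRIES is
# assumed to be defined elsewhere in the original script.
if __name__ == '__main__':
    data = {}
    load_data(data)  # populates data['table']
    subset(data)     # writes subset.csv and subset_pivot.csv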
def test_create_if_not_exists(self):
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]

    rows1 = (
        (1, 'Jake'),
        (2, 'Howard'),
    )
    rows2 = (
        (3, 'Liz'),
        (4, 'Tim'),
    )

    table1 = agate.Table(rows1, column_names, column_types)
    table2 = agate.Table(rows2, column_names, column_types)

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    # Write two agate tables into the same SQL table
    table1.to_sql(connection, 'create_if_not_exists_test',
                  create=True, create_if_not_exists=True, insert=True)
    table2.to_sql(connection, 'create_if_not_exists_test',
                  create=True, create_if_not_exists=True, insert=True)

    table = agate.Table.from_sql(connection, 'create_if_not_exists_test')

    self.assertSequenceEqual(table.column_names, column_names)
    self.assertIsInstance(table.column_types[0], agate.Number)
    self.assertIsInstance(table.column_types[1], agate.Text)
    self.assertEqual(len(table.rows), len(table1.rows) + len(table2.rows))
    self.assertSequenceEqual(table.rows[0], table1.rows[0])
def setUp(self):
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )

    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]

    self.user_provided_column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]

    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(),
        agate.Date(), agate.DateTime(),
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    text_type = agate.Text(**type_kwargs)

    if self.args.no_inference:
        types = [text_type]
    else:
        number_type = agate.Number(locale=self.args.locale, **type_kwargs)

        # See the order in the `agate.TypeTester` class.
        types = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            text_type,
        ]

        # In order to parse dates like "20010101".
        if self.args.date_format or self.args.datetime_format:
            types.insert(-1, number_type)
        else:
            types.insert(1, number_type)

    return agate.TypeTester(types=types)
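# A minimal sketch (not part of the snippet above) of why that insertion
# order matters: TypeTester keeps the first candidate type that can cast
# every value in a column, so with Number ahead of Date a column of bare
# "20010101" strings is inferred as Number; putting Date (with an explicit
# format) first lets it claim the column instead.
import agate

rows = [('20010101',), ('20020202',)]

number_first = agate.TypeTester(types=[agate.Number(), agate.Text()])
date_first = agate.TypeTester(types=[
    agate.Date(date_format='%Y%m%d'), agate.Number(), agate.Text()])

print(agate.Table(rows, ['d'], column_types=number_first).column_types)  # -> Number
print(agate.Table(rows, ['d'], column_types=date_first).column_types)    # -> Date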
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    metadata = MetaData(connection)
    sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

    column_names = []
    column_types = []

    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

    s = select([sql_table])
    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        if engine is not None:
            connection.close()
            engine.dispose()
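# A short round-trip sketch for the monkey-patched methods, assuming
# agate-sql is installed (importing agatesql registers Table.to_sql and
# Table.from_sql on agate.Table).
from sqlalchemy import create_engine

import agate
import agatesql  # noqa: F401  (side effect: patches agate.Table)

engine = create_engine('sqlite:///:memory:')
connection = engine.connect()

table = agate.Table(
    [(1, 'Jake'), (2, 'Howard')],
    ['id', 'name'],
    [agate.Number(), agate.Text()],
)
table.to_sql(connection, 'people')

round_tripped = agate.Table.from_sql(connection, 'people')
print(round_tripped)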
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        text_type = agate.Text(cast_nulls=False)
    else:
        text_type = agate.Text()

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])
    else:
        return agate.TypeTester(types=[
            agate.Boolean(),
            agate.Number(locale=self.args.locale),
            agate.TimeDelta(),
            agate.Date(date_format=self.args.date_format),
            agate.DateTime(datetime_format=self.args.datetime_format),
            text_type
        ])
def test_lookup_multiple_keys(self):
    rows = (
        ('AZ', '1985'),
        ('WY', '2014'),
        ('SC', '1994')
    )
    column_names = ['usps', 'year']
    column_types = [agate.Text(), agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    result = table.lookup(['usps', 'year'], 'population', source=self._source)

    self.assertColumnNames(result, ['usps', 'year', 'population'])
    self.assertColumnTypes(result, [agate.Text, agate.Text, agate.Number])

    self.assertSequenceEqual(result.rows[1].values(), ['WY', '2014', 584153])
def _analyze_date(self, event):
    table = self.table.where(
        lambda row: row["fields/date"] is not None).compute([
            (
                "reduce_to_date",
                agate.Formula(
                    agate.Text(),
                    lambda row: helpers.reduce_to_date(row["fields/date"]),
                ),
            ),
            (
                "reduce_to_year",
                agate.Formula(
                    agate.Number(),
                    lambda row: helpers.reduce_to_year(row["fields/date"]),
                ),
            ),
            (
                "reduce_to_time",
                agate.Formula(
                    agate.Number(),
                    lambda row: helpers.reduce_to_time(row["fields/date"]),
                ),
            ),
        ])

    years = table.distinct("reduce_to_year").columns["reduce_to_year"].values()

    _data = {}
    for year in years:
        _data[year] = (
            table.where(lambda row: row["reduce_to_year"] == year)
            .select("reduce_to_date")
            .pivot("reduce_to_date")
            .order_by("reduce_to_date"))

    event.set()

    print(f"\n\n{helpers.h1_icn} Date\n")

    for year in years:
        data_keys = list(_data[year].columns["reduce_to_date"].values())
        _counts = list(map(int, list(_data[year].columns["Count"].values())))
        _sum = sum(_counts)
        data_count = [[i] for i in _counts]
        args = {
            "color": False,
            "custom_tick": False,
            "start_dt": f"{year}-01-01"
        }

        print(f"\n{helpers.h2_icn} Year {year} ({_sum:,} emails)\n")
        calendar_heatmap(data=data_count, args=args, labels=data_keys)
def test_make_sql_table_min_col_len(self):
    rows = ((1, 'x' * 10), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    sql_table = agatesql.table.make_sql_table(
        table, 'test_table', dialect='mysql', db_schema='test_schema',
        constraints=True, min_col_len=20)

    self.assertEqual(sql_table.columns.get('name').type.length, 20)
def test_to_sql_create_statement_wide_width(self):
    rows = ((1, 'x' * 21845), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    statement = table.to_sql_create_statement(
        'test_table', db_schema='test_schema', dialect='mysql')

    self.assertEqual(statement.replace('\t', ' '), '''CREATE TABLE test_schema.test_table (
 id DECIMAL(38, 0) NOT NULL,
 name TEXT
);''')  # noqa
def test_lookup_require_match(self):
    rows = (('WA',), ('VA',), ('FA',))
    column_names = ['usps']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    with self.assertRaises(ValueError):
        table.lookup('usps', 'state', require_match=True, source=self._source)
def test_lookup_no_match(self):
    rows = (('WA',), ('VA',), ('FA',))
    column_names = ['usps']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    result = table.lookup('usps', 'state', source=self._source)

    self.assertColumnNames(result, ['usps', 'state'])
    self.assertColumnTypes(result, [agate.Text, agate.Text])

    self.assertSequenceEqual(result.rows[2].values(), ['FA', None])
def main():
    df = pd.read_csv(SRC_PATH, dtype=str)
    counts = df['date'].value_counts().sort_index()

    # just get first 10 rows and last 10 rows
    counts = pd.concat([
        counts.head(10),
        pd.Series({'...': None}, name='date'),
        counts.tail(10)
    ])

    # is there really no way to convert a Pandas series to list-of-lists?
    vals = [[k, v] for k, v in counts.to_dict().items()]

    table = agate.Table(vals, ['date', 'count'], [agate.Text(), agate.Number()])
    table.print_bars('date', 'count')
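# A hedged aside on the "list-of-lists" comment above: round-tripping a
# Series through a DataFrame does it in one step. Minimal demo with made-up
# data (not from the original script):
import pandas as pd

counts = pd.Series({'2020-01-01': 3, '2020-01-02': 5}, name='count')
vals = counts.reset_index().values.tolist()
print(vals)  # [['2020-01-01', 3], ['2020-01-02', 5]]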
def test_grouping(self):
    exonerations = agate.Table.from_csv('../../../data/exonerations-20150828.csv')

    clean_state_data = exonerations.compute([
        ('federal', agate.Formula(
            agate.Boolean(),
            lambda row: row['state'].startswith('F-'))),
        ('state', agate.Formula(
            agate.Text(),
            lambda row: row['state'][2:] if row['state'].startswith('F-') else row['state']))
    ], replace=True)

    by_state = clean_state_data.group_by('state')

    state_totals = by_state.aggregate([('count', agate.Count())])

    sorted_totals = state_totals.order_by('count', reverse=True)

    sorted_totals.print_table(max_rows=10)
def test_lookup_key(self):
    rows = (('WA',), ('VA',), ('TX',))
    column_names = ['postal']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    result = table.lookup('postal', 'state', lookup_key='usps', source=self._source)

    self.assertColumnNames(result, ['postal', 'state'])
    self.assertColumnTypes(result, [agate.Text, agate.Text])

    self.assertSequenceEqual(result.rows[1].values(), ['VA', 'Virginia'])
def test_to_sql_create_statement_zero_width(self):
    rows = (
        (1, ''),
        (2, ''),
    )
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    statement = table.to_sql_create_statement(
        'test_table', db_schema='test_schema', dialect='mysql')

    self.assertIn('CREATE TABLE test_schema.test_table', statement)
    self.assertIn('id DECIMAL(38, 0) NOT NULL,', statement)
    self.assertIn('name VARCHAR(1)', statement)
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    types = [agate.Text(**type_kwargs)]

    if not self.args.no_inference:
        types = [
            agate.Boolean(**type_kwargs),
            agate.Number(locale=self.args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        ] + types

    return agate.TypeTester(types=types)
def test_lookup_version(self):
    rows = (('1111',), ('313320',), ('522310',))
    column_names = ['naics']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    result = table.lookup('naics', 'description', version='2012', source=self._source)

    self.assertColumnNames(result, ['naics', 'description'])
    self.assertColumnTypes(result, [agate.Text, agate.Text])

    self.assertSequenceEqual(result.rows[1].values(), ['313320', 'Fabric Coating Mills'])
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    types = [agate.Text(**type_kwargs)]

    if not self.args.no_inference:
        types = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            # This is a different order than agate's default, in order to
            # parse dates like "20010101".
            agate.Number(locale=self.args.locale, **type_kwargs),
        ] + types

    return agate.TypeTester(types=types)
def setUp(self):
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
        (None, 'b', None, None, None, None)
    )

    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime', 'timedelta'
    ]

    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(),
        agate.Date(), agate.DateTime(), agate.TimeDelta()
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)