示例#1
0
def load_year_killed_data(year):
    """Download the killed/injured accident CSV for ``year`` as an agate table.

    :param year:
        Value substituted into the S3 file name.
    :returns:
        The result of ``agate.Table.from_url`` with ``killed`` and
        ``injured`` typed as numbers and ``date_hour`` typed as text.
    """
    url = 'https://s3.amazonaws.com/traffic-sd/accidents_killed_{}.csv'.format(year)
    # Explicit types for the columns that must not be left to inference.
    column_overrides = dict(
        killed=agate.Number(),
        injured=agate.Number(),
        date_hour=agate.Text(),
    )
    return agate.Table.from_url(url, column_types=column_overrides)
示例#2
0
    def _analyze_date(self, event):
        """Print one calendar heatmap of per-date email counts for each year.

        Rows whose ``fields/date`` is ``None`` are ignored. ``event.set()`` is
        called after the per-year counts are built and before anything is
        printed.

        :param event:
            Object whose ``set()`` method is invoked once aggregation is done
            (presumably a ``threading.Event`` driving a progress indicator --
            confirm against the caller).
        """
        dated_rows = self.table.where(
            lambda row: row["fields/date"] is not None)

        # Three derived columns, all computed from the raw date field.
        derived_columns = [
            ("reduce_to_date", agate.Formula(
                agate.Text(),
                lambda row: helpers.reduce_to_date(row["fields/date"]))),
            ("reduce_to_year", agate.Formula(
                agate.Number(),
                lambda row: helpers.reduce_to_year(row["fields/date"]))),
            ("reduce_to_time", agate.Formula(
                agate.Number(),
                lambda row: helpers.reduce_to_time(row["fields/date"]))),
        ]
        table = dated_rows.compute(derived_columns)

        years = table.distinct("reduce_to_year").columns[
            "reduce_to_year"].values()

        # year -> pivot table with one row per date and its "Count".
        per_year = {}
        for year in years:
            rows_of_year = table.where(
                lambda row: row["reduce_to_year"] == year)
            per_date = rows_of_year.select("reduce_to_date").pivot(
                "reduce_to_date")
            per_year[year] = per_date.order_by("reduce_to_date")

        event.set()

        print(f"\n\n{helpers.h1_icn} Date\n")

        for year in years:
            pivoted = per_year[year]
            labels = list(pivoted.columns["reduce_to_date"].values())
            counts = [int(value) for value in pivoted.columns["Count"].values()]
            total = sum(counts)
            # The heatmap receives each count wrapped in its own one-item row.
            heatmap_rows = [[count] for count in counts]

            options = {
                "color": False,
                "custom_tick": False,
                "start_dt": f"{year}-01-01"
            }

            print(f"\n{helpers.h2_icn} Year {year} ({total:,} emails)\n")
            calendar_heatmap(data=heatmap_rows, args=options, labels=labels)
示例#3
0
def calc_table(in_csv, out_csv):
    """Pivot the input CSV, add two computed columns, save and return it.

    Reads ``in_csv`` with the module-level ``column_names`` and
    ``column_types``, pivots ``HIEDUC`` against ``STAYTOG``, appends a
    ``Total`` column and then a ``Percent agree`` column, and writes the
    result to ``out_csv``.

    :returns:
        The final table that was written out.
    """
    source = agate.Table.from_csv(in_csv,
                                  column_names=column_names,
                                  column_types=column_types)
    pivoted = source.pivot('HIEDUC', 'STAYTOG')

    # Kept as two passes so the second formula runs on a table that already
    # has ``Total`` (presumably get_percent_agree reads it -- confirm).
    total_column = ('Total', agate.Formula(agate.Number(), get_total))
    with_total = pivoted.compute([total_column])

    percent_column = ('Percent agree',
                      agate.Formula(agate.Number(), get_percent_agree))
    result = with_total.compute([percent_column])

    result.to_csv(out_csv)
    return result
示例#4
0
    def setUp(self):
        """Create the five-column fixture table and the in-memory SQLite URL."""
        self.connection_string = 'sqlite:///:memory:'

        # (column name, agate type) pairs; split into parallel lists below.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        # Includes a fractional number, a non-ASCII text value, a repeated
        # number and an all-null row.
        self.rows = (
            (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
示例#5
0
    def test_create_if_not_exists(self):
        """Two ``to_sql`` calls with ``create_if_not_exists`` share one table.

        Both agate tables are written to the same SQL table; the table read
        back must keep the schema and contain the rows of both sources.
        """
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        rows1 = (
            (1, 'Jake'),
            (2, 'Howard'),
        )
        rows2 = (
            (3, 'Liz'),
            (4, 'Tim'),
        )

        table1 = agate.Table(rows1, column_names, column_types)
        table2 = agate.Table(rows2, column_names, column_types)

        engine = create_engine(self.connection_string)
        connection = engine.connect()

        try:
            # Write two agate tables into the same SQL table
            table1.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)
            table2.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)

            table = agate.Table.from_sql(connection, 'create_if_not_exists_test')
            self.assertSequenceEqual(table.column_names, column_names)
            self.assertIsInstance(table.column_types[0], agate.Number)
            self.assertIsInstance(table.column_types[1], agate.Text)
            # The combined table holds the rows of BOTH sources; the previous
            # check added len(table1.rows) twice and never looked at table2.
            self.assertEqual(len(table.rows), len(table1.rows) + len(table2.rows))
            self.assertSequenceEqual(table.rows[0], table1.rows[0])
        finally:
            # Release the connection even when an assertion fails.
            connection.close()
            engine.dispose()
示例#6
0
    def setUp(self):
        """Create the three-row, five-column fixture table used by the tests."""
        # (column name, agate type) pairs; split into parallel lists below.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        # Same names as ``column_names`` but held in a separate list object.
        self.user_provided_column_names = list(self.column_names)

        # Includes a non-ASCII text value and an all-null row.
        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
示例#7
0
    def setUp(self):
        """Load ``examples/heights.csv`` into ``self.table`` and remove stale output.

        The table is built from the CSV rows after the header, using one text
        column followed by fourteen number columns. Any file left at
        ``TEST_FILENAME`` is deleted.
        """
        text_type = agate.Text()
        number_type = agate.Number()

        columns = (
            ('gender', text_type),
            ('month', number_type),
            ('median', number_type),
            ('stdev', number_type),
            ('1st', number_type),
            ('3rd', number_type),
            ('5th', number_type),
            ('15th', number_type),
            ('25th', number_type),
            ('50th', number_type),
            ('75th', number_type),
            ('85th', number_type),
            ('95th', number_type),
            ('97th', number_type),
            ('99th', number_type)
        )

        with open('examples/heights.csv') as f:
            # Create a csv reader
            reader = csv.reader(f)

            # Skip the header as a CSV record. Advancing the raw file with
            # next(f) drops a physical line instead, which is wrong when a
            # header field contains a quoted newline.
            next(reader)

            # Create the table
            self.table = agate.Table(reader, columns)

        if os.path.exists(TEST_FILENAME):
            os.remove(TEST_FILENAME)
示例#8
0
def compute_ranks(table):
    """Append rank, standardized-rank and overall-rank columns to ``table``.

    Four ``compute`` passes run in order: raw ranks, a ``*_std`` column per
    raw rank, the ``overall_rank_data`` formula, and finally ``overall_rank``
    ranking that formula's output.

    :returns:
        The table produced by the last pass.
    """
    # Disabled ranks, kept for reference: open_formats_rank,
    # open_datasets_rank, update_start_rank, start_rank, openess_score.
    raw_ranks = [
        ('dataset_rank', agate.Rank('datasets', reverse=True)),
        ('formats_rank', agate.Rank('format_count', reverse=True)),
        ('last_update_rank', agate.Rank('days_since_last_update')),
        ('category_rank', agate.Rank('category_count', reverse=True)),
        ('category_variance_rank', agate.Rank('category_variance')),
        ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)),
        ('category_score_rank', agate.Rank('category_score', reverse=True)),
    ]
    ranked = table.compute(raw_ranks)

    # Every raw rank gets a standardized twin named "<rank>_std".
    standardized_ranks = [
        (rank_name + '_std', StandadizeScore(rank_name))
        for rank_name, _ in raw_ranks
    ]
    standardized = ranked.compute(standardized_ranks)

    scored = standardized.compute([
        ('overall_rank_data', agate.Formula(agate.Number(), overall_rank)),
    ])
    return scored.compute([
        ('overall_rank', agate.Rank('overall_rank_data')),
    ])
    def _overall_stats(self):
        """Compute package- and resource-level statistics and store them on ``self``.

        Always sets ``overall_package_stats`` and ``resource_stats``. Only when
        the package/resource table has at least one row does it also set
        ``open_datasets``, ``open_format_count`` and ``format_count`` and call
        ``compute_dates()``.
        """
        # Summary over the license_id column: counts the values that are
        # members of utils.OPEN_LICENSES (each True adds 1 to the sum).
        count_open_licenses = agate.Summary(
            'license_id', agate.Number(),
            lambda r: sum(license_id in utils.OPEN_LICENSES
                          for license_id in r.values()))

        self.overall_package_stats = self._package_table().aggregate([
            ('open_data_count', count_open_licenses),
        ])
        # At this point resource_stats is the resource table with an extra
        # boolean 'open_format' column ...
        self.resource_stats = self._package_resource_table().compute([
            ('open_format', agate.Formula(agate.Boolean(),
                                          open_formats_count)),
        ])
        if len(self._package_resource_table()) > 0:
            # ... and here it is replaced by the aggregate of that table.
            self.resource_stats = self.resource_stats.aggregate([
                ('open_format_count', agate.Count('open_format', True)),
                ('min_date', agate.Min('created')),
                ('max_date', agate.Max('created'))
            ])
            # One row per distinct format, so counting these rows gives the
            # number of different formats.
            format_table = self._package_resource_table().group_by(
                "format").aggregate([
                    ('count', agate.Count()),
                ])
            count = format_table.aggregate([
                ('different_formats', agate.Count()),
            ])
            # NOTE(review): the aggregate results are read with .get(), so
            # they are presumably dict-like -- confirm for the agate version
            # in use.
            self.open_datasets = self.overall_package_stats.get(
                "open_data_count", 0)
            self.open_format_count = self.resource_stats.get(
                "open_format_count", 0)
            self.format_count = count.get("different_formats", 0)
            self.compute_dates()
 def get_package_stats(self, package_table):
     """Aggregate ``package_table`` into a single ``open_data_count`` figure.

     The figure is the number of ``license_id`` values that are members of
     ``utils.OPEN_LICENSES``.
     """
     def _open_license_total(column):
         # Each membership test is a bool; summing them counts the matches.
         return sum(value in utils.OPEN_LICENSES
                    for value in column.values())

     open_license_summary = agate.Summary(
         'license_id', agate.Number(), _open_license_total)
     return package_table.aggregate([
         ('open_data_count', open_license_summary),
     ])
示例#11
0
    def test_chunk_size(self):
        """Round-trip 9999 single-number rows through SQL in chunks of 100.

        The table read back must have the same row count and the same sum of
        values as what was written.
        """
        rows = [(value, ) for value in range(9999)]
        expected = sum(value for (value, ) in rows)

        engine = create_engine(self.connection_string)
        connection = engine.connect()

        try:
            written = agate.Table(rows, ['number'], [agate.Number()])
            written.to_sql(connection,
                           'test_chunk_size',
                           overwrite=True,
                           chunk_size=100)

            read_back = agate.Table.from_sql(connection, 'test_chunk_size')
            actual = sum(row[0] for row in read_back.rows)
            self.assertEqual(len(read_back.rows), len(rows))
            self.assertEqual(expected, actual)
        finally:
            connection.close()
            engine.dispose()
示例#12
0
文件: prove.py 项目: Quartz/refugees
def load_data(data):
    """
    Read the UNHCR persons-of-concern CSV into ``data['table']``.

    The first record is skipped and every ``'*'`` cell is replaced by
    ``None`` before the rows are handed to agate.
    """
    text = agate.Text()
    number = agate.Number()

    # Column order matters: names and types are passed positionally below.
    columns = OrderedDict([
        ('year', number),
        ('residence', text),
        ('origin', text),
        ('refugees', number),
        ('asylum_seekers', number),
        ('returned_refugees', number),
        ('idps', number),
        ('returned_idps', number),
        ('stateless_persons', number),
        ('others', number),
        ('total', number),
    ])

    source_path = 'unhcr_popstats_export_persons_of_concern_2016_01_12_192533.csv'

    with open(source_path) as f:
        reader = csvkit.reader(f)
        next(reader)  # skip the header record

        rows = [
            [None if cell == '*' else cell for cell in record]
            for record in reader
        ]

        data['table'] = agate.Table(rows, columns.keys(), columns.values())
示例#13
0
def load_data(data):
    """Load the exonerations CSV into ``data['exonerations']``.

    The header record is skipped and the remaining rows are typed with the
    explicit column list below.

    :param data:
        Mutable mapping that receives the table under ``'exonerations'``.
    """
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip the header as a CSV record. Advancing the raw file with
        # next(f) drops a physical line instead, which is wrong when a
        # header field contains a quoted newline.
        next(reader)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
示例#14
0
    def get_column_types(self):
        """Build the ``agate.TypeTester`` implied by the command-line options.

        With ``no_inference`` only text is tested. Otherwise the number type
        sits right after boolean, unless a date or datetime format was given,
        in which case it sits just before text.
        """
        # With ``blanks`` set, the types are built with no null values.
        type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        number_type = agate.Number(locale=self.args.locale, **type_kwargs)
        boolean_type = agate.Boolean(**type_kwargs)
        timedelta_type = agate.TimeDelta(**type_kwargs)
        date_type = agate.Date(date_format=self.args.date_format, **type_kwargs)
        datetime_type = agate.DateTime(
            datetime_format=self.args.datetime_format, **type_kwargs)

        # See the order in the `agate.TypeTester` class.
        if self.args.date_format or self.args.datetime_format:
            # In order to parse dates like "20010101".
            types = [boolean_type, timedelta_type, date_type, datetime_type,
                     number_type, text_type]
        else:
            types = [boolean_type, number_type, timedelta_type, date_type,
                     datetime_type, text_type]

        return agate.TypeTester(types=types)
示例#15
0
    def test_create_if_not_exists(self):
        """Write two agate tables into one SQL table via ``create_if_not_exists``.

        The test passes when neither ``to_sql`` call raises; the second call
        must not fail because the table already exists.
        """
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        rows1 = (
            (1, 'Jake'),
            (2, 'Howard'),
        )
        rows2 = (
            (3, 'Liz'),
            (4, 'Tim'),
        )

        table1 = agate.Table(rows1, column_names, column_types)
        table2 = agate.Table(rows2, column_names, column_types)

        engine = create_engine(self.connection_string)
        connection = engine.connect()

        try:
            # Write two agate tables into the same SQL table
            table1.to_sql(connection,
                          'create_if_not_exists_test',
                          create=True,
                          create_if_not_exists=True,
                          insert=True)
            table2.to_sql(connection,
                          'create_if_not_exists_test',
                          create=True,
                          create_if_not_exists=True,
                          insert=True)
        finally:
            # Previously the connection and engine were never released.
            connection.close()
            engine.dispose()
示例#16
0
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :returns:
        An :class:`agate.Table` holding every row of the SQL table.
    :raises ValueError:
        If a column's Python type has no agate equivalent.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # The whole body is guarded so that a reflection failure, an unsupported
    # column type or a failed query cannot leak the connection and engine.
    try:
        metadata = MetaData(connection)
        sql_table = Table(table_name,
                          metadata,
                          autoload=True,
                          autoload_with=connection)

        column_names = []
        column_types = []

        for sql_column in sql_table.columns:
            column_names.append(sql_column.name)

            # Interval types are mapped to timedelta directly instead of
            # going through ``python_type``.
            if type(sql_column.type) in INTERVAL_MAP.values():
                py_type = datetime.timedelta
            else:
                py_type = sql_column.type.python_type

            if py_type in [int, float, decimal.Decimal]:
                if py_type is float:
                    # Ask sqlalchemy for Decimal results on float columns.
                    sql_column.type.asdecimal = True
                column_types.append(agate.Number())
            elif py_type is bool:
                column_types.append(agate.Boolean())
            elif issubclass(py_type, six.string_types):
                column_types.append(agate.Text())
            elif py_type is datetime.date:
                column_types.append(agate.Date())
            elif py_type is datetime.datetime:
                column_types.append(agate.DateTime())
            elif py_type is datetime.timedelta:
                column_types.append(agate.TimeDelta())
            else:
                raise ValueError('Unsupported sqlalchemy column type: %s' %
                                 type(sql_column.type))

        s = select([sql_table])

        rows = connection.execute(s)

        return agate.Table(rows, column_names, column_types)
    finally:
        # NOTE(review): ``engine`` is presumably None when the caller passed
        # in its own connection, which is then left open -- confirm against
        # get_engine_and_connection.
        if engine is not None:
            connection.close()
            engine.dispose()
示例#17
0
def _add_random_column(data_tbl):
    """Return ``data_tbl`` with an extra numeric ``random_group`` column.

    The column values come from ``_generate_random``; the ``npr`` seed is
    reset first.
    """
    # Reset seed to produce random numbers
    npr.seed()

    random_group = agate.Formula(agate.Number(), _generate_random)
    return data_tbl.compute([('random_group', random_group)])
    def test_load(self):
        """Load the exonerations CSV with three forced column types and print it."""
        forced_types = {
            'last_name': agate.Text(),
            'first_name': agate.Text(),
            'age': agate.Number()
        }
        type_tester = agate.TypeTester(force=forced_types)

        csv_path = '../../../data/exonerations-20150828.csv'
        exonerations = agate.Table.from_csv(csv_path, column_types=type_tester)
        print(exonerations)  # description of the table
示例#19
0
    def test_make_sql_table_min_col_len(self):
        """``min_col_len`` sets the length of the generated ``name`` column.

        The longest ``name`` value has 10 characters, ``min_col_len`` is 20,
        and the resulting column type must report a length of 20.
        """
        rows = ((1, 'x' * 10), (2, ''))
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        table = agate.Table(rows, column_names, column_types)

        sql_table = agatesql.table.make_sql_table(table, 'test_table', dialect='mysql', db_schema='test_schema',
                                                  constraints=True, min_col_len=20)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(sql_table.columns.get('name').type.length, 20)
示例#20
0
    def test_to_sql_create_statement_wide_width(self):
        """A very long text value makes the MySQL column a ``TEXT`` column.

        The generated CREATE TABLE statement is compared verbatim (tabs
        replaced by two spaces) with the expected SQL.
        """
        # NOTE(review): 21845 is presumably 65535 // 3, i.e. chosen around
        # MySQL's VARCHAR size limit -- confirm.
        rows = ((1, 'x' * 21845), (2, ''))
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        table = agate.Table(rows, column_names, column_types)

        statement = table.to_sql_create_statement('test_table', db_schema='test_schema', dialect='mysql')

        # The expected literal contains significant trailing whitespace after
        # "NOT NULL,", hence the noqa marker on the closing line.
        self.assertEqual(statement.replace('\t', '  '), '''CREATE TABLE test_schema.test_table (
  id DECIMAL(38, 0) NOT NULL, 
  name TEXT
);''')  # noqa
示例#21
0
def get_types(example_row):
    """Map each xlrd cell in ``example_row`` to an agate data type instance.

    ``text`` cells become ``agate.Text``, ``number`` cells ``agate.Number``,
    ``xldate`` cells ``agate.Date``; every other cell type falls back to
    ``agate.Text``.

    :returns:
        A list with one freshly created type per cell, in row order.
    """
    type_for_ctype = {
        'text': agate.Text,
        'number': agate.Number,
        'xldate': agate.Date,
    }
    return [
        type_for_ctype.get(xlrd.sheet.ctype_text[cell.ctype], agate.Text)()
        for cell in example_row
    ]
示例#22
0
    def test_distinct_values(self):
        """Compare ``BaseHound.distinct_values('size')`` with agate's own result.

        The expected table is built in memory, reduced to the ``size`` column
        and de-duplicated with agate.
        """
        column_names: List = ['id', 'name', 'dob', 'last seen', 'size', 'active']
        column_types: List = [
            agate.Number(),
            agate.Text(),
            agate.Date(),
            agate.DateTime(),
            agate.Text(),
            agate.Boolean(),
        ]

        # Every row shares the same "last seen" timestamp.
        last_seen = '06-30-2019 12:12:00'
        rows = [
            (1, 'Alvin Cotton', '03-01-1980', last_seen, 'L', True),
            (2, 'Usmaan Rojas', '01-12-1978', last_seen, 'S', False),
            (3, 'Kingston Odling', '04-09-1990', last_seen, 'M', True),
            (3, 'Pooja Gillespie', '10-07-1985', last_seen, 'S', True),
            (4, 'Hal Blake', '08-17-1989', last_seen, 'L', True),
            (5, 'Shannen Blevins', '06-10-1981', last_seen, 'M', False),
            (5, 'Courteney Weston', '04-23-1992', last_seen, 'M', False),
            (6, 'Conner Calhoun', '05-16-1977', last_seen, 'XL', True),
            (7, 'Susie Rasmussen', '02-08-1987', last_seen, 'L', False),
            (8, 'Cassie Beltran', '12-15-1982', last_seen, 'M', True),
        ]

        model = csvhound.core.BaseHound()
        # Return value is not needed; the call is kept because it presumably
        # loads the file into the model -- confirm.
        model.get_table_from_file('sample-data/test-distinct.csv')
        distinct = model.distinct_values('size')

        reference = agate.Table(rows, column_names, column_types)
        expected = reference.select('size').distinct('size')

        # now do the testing
        self.assertColumnNames(distinct, ('size', ))
        self.assertColumnTypes(
            distinct, [type(column_type) for column_type in distinct.column_types])
        self.assertRows(distinct, expected)
def main():
    """Print a bar chart of row counts per ``date`` value from ``SRC_PATH``.

    Only the first ten and last ten entries (after sorting by index) are
    shown, separated by a ``'...'`` placeholder entry.
    """
    frame = pd.read_csv(SRC_PATH, dtype=str)
    per_date = frame['date'].value_counts().sort_index()

    # just keep the first 10 rows and the last 10 rows
    gap_marker = pd.Series({'...': None}, name='date')
    trimmed = pd.concat([per_date.head(10), gap_marker, per_date.tail(10)])

    # agate wants a list of rows, so turn the series into [label, count] pairs
    pairs = [[label, total] for label, total in trimmed.to_dict().items()]

    chart_table = agate.Table(pairs, ['date', 'count'],
                              [agate.Text(), agate.Number()])
    chart_table.print_bars('date', 'count')
示例#24
0
    def test_to_sql_create_statement_zero_width(self):
        """An all-empty text column is emitted as ``VARCHAR(1)`` for MySQL."""
        rows = ((1, ''), (2, ''))
        table = agate.Table(rows, ['id', 'name'],
                            [agate.Number(), agate.Text()])

        statement = table.to_sql_create_statement(
            'test_table', db_schema='test_schema', dialect='mysql')

        expected_fragments = (
            'CREATE TABLE test_schema.test_table',
            'id DECIMAL(38, 0) NOT NULL,',
            'name VARCHAR(1)',
        )
        # Checked one by one, in the original order.
        for fragment in expected_fragments:
            self.assertIn(fragment, statement)
示例#25
0
    def get_column_types(self):
        """Return the ``agate.TypeTester`` selected by the command-line options.

        With ``blanks`` set, the text type is created with ``cast_nulls=False``.
        With ``no_inference`` set, text is the only type tested.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        candidate_types = [
            agate.Boolean(),
            agate.Number(locale=self.args.locale),
            agate.TimeDelta(),
            agate.Date(date_format=self.args.date_format),
            agate.DateTime(datetime_format=self.args.datetime_format),
            text_type,
        ]
        return agate.TypeTester(types=candidate_types)
示例#26
0
文件: cli.py 项目: leonqli/csvkit
    def get_column_types(self):
        """Return the ``agate.TypeTester`` selected by the command-line options.

        With ``blanks`` set, every type is created with ``null_values=()``.
        With ``no_inference`` set, text is the only type tested; otherwise
        boolean, number, timedelta, date and datetime are tested before text.
        """
        type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        candidate_types = [
            agate.Boolean(**type_kwargs),
            agate.Number(locale=self.args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        ]
        candidate_types.append(text_type)

        return agate.TypeTester(types=candidate_types)
示例#27
0
def sum_counts_by_hour(data):
    """Store per-hour accident statistics in ``data['hour']`` and return ``data``.

    ``data['table']`` is grouped by ``hour`` and aggregated, percentage
    columns are added, and a final pass adds the ``weighted`` sum plus three
    ``StandardDeviations`` columns using 0.5.
    """
    hourly_totals = data['table'].group_by('hour').aggregate([
        ('killed', agate.Sum('killed')),
        ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('accidents_injured', count_accidents_injured),
    ])

    with_percents = hourly_totals.compute([
        ('killed_percent', agate.Percent('killed')),
        ('injured_percent', agate.Percent('injured')),
        ('accidents_percent', agate.Percent('accidents')),
    ])

    def _weighted(row):
        # Sum of the killed and injured percentage columns.
        return row['killed_percent'] + row['injured_percent']

    data['hour'] = with_percents.compute([
        ('weighted', agate.Formula(agate.Number(), _weighted)),
        ('accidents_within_half_deviation',
         StandardDeviations('accidents', 0.5)),
        ('killed_within_half_deviation', StandardDeviations('killed', 0.5)),
        ('injured_within_half_deviation', StandardDeviations('injured', 0.5)),
    ])
    return data
示例#28
0
def update_where(self, update_col, update_val, test_col, test_val):
    """Return a copy of this table with ``update_col`` recomputed.

    Helper columns holding the current update/test values and the requested
    ``update_val``/``test_val`` are joined on, ``update_where_function``
    computes the new value per row, and the helpers are dropped again.

    NOTE(review): the computed column is always typed ``agate.Number()``,
    whatever the type of ``update_col`` -- confirm this is intended.
    """
    # check the types of the update and test columns.
    names = self.column_names
    constant_types = [
        self.column_types[names.index(update_col)],
        self.column_types[names.index(test_col)],
    ]

    # Copies of the two columns under fixed names.
    aliases = self.select([update_col, test_col]).rename(
        column_names={update_col: 'update_col', test_col: 'test_col'})
    working = self.join(aliases)

    # One constant (update_val, test_val) row per row of the joined table.
    constants = agate.Table(
        [[update_val, test_val]] * len(working.rows),
        ['update_val', 'test_val'],
        constant_types)
    working = working.join(constants)

    working = working.compute([
        ('updated', agate.Formula(agate.Number(), update_where_function)),
    ])

    # Swap the computed column in under the original name, then drop helpers.
    renamed = working.rename(
        column_names={update_col: 'old', 'updated': update_col})
    return renamed.exclude(
        ['old', 'update_col', 'test_col', 'update_val', 'test_val'])
示例#29
0
文件: cli.py 项目: v838/csvkit
    def get_column_types(self):
        """Return the ``agate.TypeTester`` selected by the command-line options.

        With ``blanks`` set, every type is created with ``null_values=()``.
        With ``no_inference`` set, text is the only type tested; otherwise
        number is tested after the date types and just before text.
        """
        type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        candidate_types = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **type_kwargs),
            # This is a different order than agate's default, in order to parse dates like "20010101".
            agate.Number(locale=self.args.locale, **type_kwargs),
        ]
        candidate_types.append(text_type)

        return agate.TypeTester(types=candidate_types)
示例#30
0
    def setUp(self):
        """Create the three-row, six-column fixture table (with a timedelta)."""
        # (column name, agate type) pairs; split into parallel lists below.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
            ('timedelta', agate.TimeDelta()),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        # Includes a non-ASCII text value and an all-null row.
        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
            (None, 'b', None, None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)