Пример #1
0
    def setUp(self):
        """Create the fixture rows, schema, agate table and SQLite URL."""
        # Column name and data type are declared side by side and then
        # split into the two parallel lists the tests read.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        # The second row carries a non-ASCII text value; the last row is
        # null everywhere except the text column.
        self.rows = (
            (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
        self.connection_string = 'sqlite:///:memory:'
Пример #2
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        an empty ``null_values`` tuple. When ``self.args.no_inference`` is
        truthy only the text type is offered.
        """
        type_kwargs = (
            {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        )
        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        number_type = agate.Number(locale=self.args.locale, **type_kwargs)
        boolean_type = agate.Boolean(**type_kwargs)
        temporal_types = [
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **type_kwargs),
        ]

        # See the order in the `agate.TypeTester` class.
        if self.args.date_format or self.args.datetime_format:
            # In order to parse dates like "20010101", the number type is
            # tried only after the date/time types.
            ordered = [boolean_type] + temporal_types + [number_type,
                                                         text_type]
        else:
            ordered = [boolean_type, number_type] + temporal_types + [
                text_type
            ]

        return agate.TypeTester(types=ordered)
Пример #3
0
    def setUp(self):
        """Create the fixture rows, both column-name lists and the table."""
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )
        self.column_names = [name for name, _ in schema]
        # Same names, but a separate list object.
        self.user_provided_column_names = list(self.column_names)
        self.column_types = [data_type for _, data_type in schema]

        # The last row is null everywhere except the text column.
        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
Пример #4
0
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :returns:
        An :class:`agate.Table` holding every row of the SQL table.
    :raises ValueError:
        If a column's Python type is not one of the supported types.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # Everything after acquiring the connection runs inside the try block so
    # that a failure during reflection, type mapping or the query itself
    # still releases the connection and engine.
    try:
        metadata = MetaData(connection)
        sql_table = Table(table_name,
                          metadata,
                          autoload=True,
                          autoload_with=connection)

        column_names = []
        column_types = []

        for sql_column in sql_table.columns:
            column_names.append(sql_column.name)

            # Interval columns are mapped to timedelta directly instead of
            # asking the column type for its python_type.
            if type(sql_column.type) in INTERVAL_MAP.values():
                py_type = datetime.timedelta
            else:
                py_type = sql_column.type.python_type

            if py_type in [int, float, decimal.Decimal]:
                if py_type is float:
                    # NOTE(review): presumably makes SQLAlchemy return
                    # Decimal values for float columns — confirm.
                    sql_column.type.asdecimal = True
                column_types.append(agate.Number())
            elif py_type is bool:
                column_types.append(agate.Boolean())
            elif issubclass(py_type, six.string_types):
                column_types.append(agate.Text())
            elif py_type is datetime.date:
                column_types.append(agate.Date())
            elif py_type is datetime.datetime:
                column_types.append(agate.DateTime())
            elif py_type is datetime.timedelta:
                column_types.append(agate.TimeDelta())
            else:
                raise ValueError('Unsupported sqlalchemy column type: %s' %
                                 type(sql_column.type))

        rows = connection.execute(select([sql_table]))

        return agate.Table(rows, column_names, column_types)
    finally:
        # NOTE(review): engine is presumably None when the caller supplied
        # an existing connection, which is then left open — confirm against
        # get_engine_and_connection.
        if engine is not None:
            connection.close()
            engine.dispose()
Пример #5
0
def get_types(example_row):
    """
    Return one new agate data type instance per cell of ``example_row``.

    Each cell's ``ctype`` is looked up in ``xlrd.sheet.ctype_text``:
    ``'text'`` gives Text, ``'number'`` gives Number, ``'xldate'`` gives
    Date, and anything else falls back to Text.
    """
    factories = {
        'text': agate.Text,
        'number': agate.Number,
        'xldate': agate.Date,
    }
    return [
        factories.get(xlrd.sheet.ctype_text[cell.ctype], agate.Text)()
        for cell in example_row
    ]
Пример #6
0
    def test_distinct_values(self):
        """
        Compare ``BaseHound.distinct_values('size')`` with the result of
        agate's own ``select('size').distinct('size')`` on in-code rows.
        """
        column_names: List = [
            'id', 'name', 'dob', 'last seen', 'size', 'active',
        ]
        column_types: List = [
            agate.Number(),
            agate.Text(),
            agate.Date(),
            agate.DateTime(),
            agate.Text(),
            agate.Boolean(),
        ]

        # Every record shares the same "last seen" timestamp, so it is
        # spliced in while the full rows are built.
        last_seen = '06-30-2019 12:12:00'
        people = (
            (1, 'Alvin Cotton', '03-01-1980', 'L', True),
            (2, 'Usmaan Rojas', '01-12-1978', 'S', False),
            (3, 'Kingston Odling', '04-09-1990', 'M', True),
            (3, 'Pooja Gillespie', '10-07-1985', 'S', True),
            (4, 'Hal Blake', '08-17-1989', 'L', True),
            (5, 'Shannen Blevins', '06-10-1981', 'M', False),
            (5, 'Courteney Weston', '04-23-1992', 'M', False),
            (6, 'Conner Calhoun', '05-16-1977', 'XL', True),
            (7, 'Susie Rasmussen', '02-08-1987', 'L', False),
            (8, 'Cassie Beltran', '12-15-1982', 'M', True),
        )
        rows = [
            (person_id, name, dob, last_seen, size, active)
            for person_id, name, dob, size, active in people
        ]

        # NOTE(review): the CSV presumably holds the same records as the
        # rows above — confirm against sample-data/test-distinct.csv.
        model = csvhound.core.BaseHound()
        model.get_table_from_file('sample-data/test-distinct.csv')
        distinct = model.distinct_values('size')

        expected = agate.Table(rows, column_names, column_types)
        distinct_agate = expected.select('size').distinct('size')

        # now do the testing
        self.assertColumnNames(distinct, ('size', ))
        self.assertColumnTypes(
            distinct, [type(column_type)
                       for column_type in distinct.column_types])
        self.assertRows(distinct, distinct_agate)
Пример #7
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for type inference.

        The text type is created with ``cast_nulls=False`` when
        ``self.args.blanks`` is truthy. With ``self.args.no_inference`` set,
        the text type is the only candidate.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

        if self.args.no_inference:
            candidates = [text_type]
        else:
            candidates = [
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
                text_type,
            ]

        return agate.TypeTester(types=candidates)
Пример #8
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        an empty ``null_values`` tuple. When ``self.args.no_inference`` is
        truthy only the text type is offered.
        """
        type_kwargs = (
            {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        )
        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        inferred = [
            agate.Boolean(**type_kwargs),
            agate.Number(locale=self.args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **type_kwargs),
        ]
        # Text is always the last candidate.
        inferred.append(text_type)

        return agate.TypeTester(types=inferred)
Пример #9
0
    def setUp(self):
        """Create the fixture rows, schema and table, including a timedelta column."""
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
            ('timedelta', agate.TimeDelta()),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        # The last row is null everywhere except the text column.
        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
            (None, 'b', None, None, None, None),
        )

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
Пример #10
0
Файл: cli.py Проект: v838/csvkit
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        an empty ``null_values`` tuple. When ``self.args.no_inference`` is
        truthy only the text type is offered.
        """
        type_kwargs = (
            {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        )
        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        inferred = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **type_kwargs),
            # This is a different order than agate's default, in order to
            # parse dates like "20010101".
            agate.Number(locale=self.args.locale, **type_kwargs),
            text_type,
        ]

        return agate.TypeTester(types=inferred)
Пример #11
0
    def setUp(self):
        """Create string-valued fixture rows, one data type per column, and the table."""
        # Every value is a string; the last row is made of empty strings.
        self.rows = (
            ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
            ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'),
            ('',) * 6,
        )

        self.number_type = agate.Number()
        self.text_type = agate.Text()
        self.boolean_type = agate.Boolean()
        self.date_type = agate.Date()
        self.datetime_type = agate.DateTime()
        self.timedelta_type = agate.TimeDelta()

        self.column_names = (
            'number',
            'text',
            'boolean',
            'date',
            'datetime',
            'timedelta',
        )
        self.column_types = (
            self.number_type,
            self.text_type,
            self.boolean_type,
            self.date_type,
            self.datetime_type,
            self.timedelta_type,
        )

        # The table is given (name, type) pairs as its second argument.
        column_info = zip(self.column_names, self.column_types)
        self.table = agate.Table(self.rows, column_info)
Пример #12
0
print(title_rows)

# Join the first two entries of every title row with a space and trim the
# surrounding whitespace.
titles = [(t[0] + ' ' + t[1]).strip() for t in title_rows]
titles

# Sheet rows 6 through 113 are read as the data rows.
country_rows = [sheet.row_values(r) for r in range(6, 114)]
country_rows

from xlrd.sheet import ctype_text
import agate

text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
date_type = agate.Date()

# Inspect the first data row: the cells, then the first cell's type code and
# value, then xlrd's type-code-to-name mapping. Single-argument print calls
# are used so the script parses on Python 3 and prints the same on Python 2.
example_row = sheet.row(6)
print(example_row)
print(example_row[0].ctype)
print(example_row[0].value)
print(ctype_text)

types = []

# Translate each cell's xlrd type name into the matching agate type.
for v in example_row:
    value_type = ctype_text[v.ctype]
    if value_type == 'text':
        types.append(text_type)
    elif value_type == 'number':
        types.append(number_type)
Пример #13
0
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             row_limit=None,
             column_names=None,
             column_types=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param encoding_override:
        Passed to :func:`xlrd.open_workbook` on the first attempt to open
        the workbook.
    :param row_limit:
        Limit how many rows of data will be read
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    def open_workbook(f):
        try:
            return xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override,
                                      on_demand=True)
        except xlrd.compdoc.CompDocError:
            # This is not a pure XLS file; we'll try to read it though.
            # Let's try the Compound File Binary Format:
            ole = olefile.OleFileIO(f)
            if not ole.exists('Workbook'):
                raise IOError('No Workbook stream found in OLE file')
            stream = ole.openstream('Workbook')
            return xlrd.open_workbook(file_contents=stream.read(),
                                      on_demand=True)

    if hasattr(path, 'read'):
        book = open_workbook(path)
    else:
        with open(path, 'rb') as f:
            book = open_workbook(f)

    # The data rows start after the skipped lines and the optional header
    # row. The end bound is passed only when a row limit was requested.
    first_data_row = skip_lines + (1 if header else 0)
    if row_limit is None:
        row_bounds = (first_data_row, )
    else:
        row_bounds = (first_data_row, first_data_row + row_limit)

    try:
        multiple = agate.utils.issequence(sheet)
        sheets = sheet if multiple else [sheet]

        tables = OrderedDict()

        for sheet_ref in sheets:
            # A name or an index selects a sheet; anything else (such as
            # None) selects the first sheet.
            if isinstance(sheet_ref, six.string_types):
                worksheet = book.sheet_by_name(sheet_ref)
            elif isinstance(sheet_ref, int):
                worksheet = book.sheet_by_index(sheet_ref)
            else:
                worksheet = book.sheet_by_index(0)

            column_names_detected = [] if header else None
            columns = []
            column_types_detected = []

            for col_idx in range(worksheet.ncols):
                values = worksheet.col_values(col_idx, *row_bounds)
                types = worksheet.col_types(col_idx, *row_bounds)
                excel_type = determine_excel_type(types)
                agate_type = determine_agate_type(excel_type)

                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                    values = normalize_booleans(values)
                elif excel_type == xlrd.biffh.XL_CELL_DATE:
                    values, with_date, with_time = normalize_dates(
                        values, book.datemode)
                    # Both checks always run, so when neither flag is set
                    # the column ends up typed as a date.
                    if not with_date:
                        agate_type = agate.TimeDelta()
                    if not with_time:
                        agate_type = agate.Date()

                if header:
                    # An empty header cell becomes None.
                    header_text = six.text_type(
                        worksheet.cell_value(skip_lines, col_idx))
                    column_names_detected.append(header_text or None)

                columns.append(values)
                column_types_detected.append(agate_type)

            # Transpose the column-wise values into rows, sized by the
            # first column.
            rows = []
            if columns:
                rows = [[column[row_idx] for column in columns]
                        for row_idx in range(len(columns[0]))]

            sheet_column_names = (column_names_detected
                                  if column_names is None else column_names)

            # A dict of column types is merged over the detected types;
            # any other value is passed through unchanged.
            sheet_column_types = column_types
            if isinstance(column_types,
                          dict) and sheet_column_names is not None:
                sheet_column_types = dict(
                    zip(sheet_column_names, column_types_detected))
                sheet_column_types.update(column_types)

            tables[worksheet.name] = agate.Table(rows, sheet_column_names,
                                                 sheet_column_types, **kwargs)

    finally:
        book.release_resources()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    return tables.popitem()[1]
Пример #14
0
        print(impressions)
        return None, None


# Upsert statement with one named bind parameter per entry in KEYS; on an
# ad_id conflict every listed column is set to the incoming value.
INSERT_QUERY = (
    "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}"
).format(
    ', '.join(KEYS),
    ', '.join(":" + k for k in KEYS),
    ', '.join(f"{k} = :{k}" for k in KEYS),
)

def chunks(l, n):
    """Yield consecutive slices of ``l``, each at most ``n`` items long."""
    yield from (l[start:start + n] for start in range(0, len(l), n))


# Specifying column types saves 50% of the time spent loading the CSV
# (30 min without, 15 min with).
CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    # NOTE(review): Boolean here but Text in the old schema below — confirm
    # this is intended.
    'Ad_Campaigns_List': agate.Boolean(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
    'First_Served_Timestamp': agate.DateTime(),
    'Last_Served_Timestamp': agate.DateTime(),
    'Age_Targeting': agate.Text(),
    'Gender_Targeting': agate.Text(),
    'Geo_Targeting_Included': agate.Text(),
    'Geo_Targeting_Excluded': agate.Text(),
}
# Each currency contributes a Min and a Max numeric spend-range column, in
# that order.
CREATIVE_STATS_COLUMN_TYPES.update(
    ('Spend_Range_{}_{}'.format(bound, currency), agate.Number())
    for currency in ('USD', 'EUR', 'INR', 'BGN', 'HRK', 'CZK', 'DKK', 'HUF',
                     'PLN', 'RON', 'SEK', 'GBP', 'NZD')
    for bound in ('Min', 'Max')
)


# Column types for the schema used before the change date below.
OLD_CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Text(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
}


# The schema changes sometime around this date; the exact day is not known.
CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1)
# Date:   02/23/2020
# Course: DSC-540 - Data Preparation
# Desc:   Practice joining numerous datasets – an activity you will likely run into frequently. Following the example
#         in your text that starts on page 229 – 233 of Data Wrangling with Python, work through the example to bring
#         two datasets together.
# Usage:  This program is to complete assignment 11.2 requirements
#
# Import required packages
import xlrd
import agate
from xlrd.sheet import ctype_text

# Module-level agate data type instances, each created once at import time.
text_type = agate.Text()  # agate text data type
number_type = agate.Number()  # agate number data type
boolean_type = agate.Boolean()  # agate boolean data type
date_type = agate.Date()  # agate date data type


def remove_bad_chars(val):
    """Return ``None`` when ``val`` is the placeholder ``'-'``; otherwise return ``val`` unchanged.

    :param val: input value, typically a string
    :return: ``None`` for ``'-'``, else the input itself
    """
    return None if val == '-' else val


def get_types(example_row):
    """
    This routine based on data in a row determines the column type
Пример #16
0
import re
import six

# Maps number-format names/patterns to the agate data type instance used for
# cells carrying that format.
# NOTE(review): the keys are raw strings with regex-style escaping, so they
# are presumably matched as regular expressions against a cell's
# mso-number-format value — confirm against the lookup code.
# NOTE(review): if the keys are regexes, `0\.E+00` matches "0." followed by
# one or more "E" and then "00" (the "+" is a quantifier, not a literal), and
# the `??` / `???` runs in the fraction patterns act as quantifiers rather
# than literal question marks — verify that this is intended.
# NOTE(review): 'Long Date' uses an empty date_format — confirm.
MSO_NUMBER_FORMAT_TO_AGATE_TYPE = {
    r'0': agate.Number(),
    r'0\.0': agate.Number(),
    r'0\.00': agate.Number(),
    r'0\.000': agate.Number(),
    r'0\.0000': agate.Number(),
    r'0\.E+00': agate.Number(),
    r'0%': agate.Number(),
    r'Percent': agate.Number(),
    r'\#\ ?\/?': agate.Number(),
    r'\#\ ??\/??': agate.Number(),
    r'\#\ ???\/???': agate.Number(),
    r'Short Date': agate.Date(date_format='%d/%m/%Y'),
    r'Medium Date': agate.Date(date_format='%d-%b-%y'),
    r'Long Date': agate.Date(date_format=''),
    r'Short Time': agate.DateTime(datetime_format='%H:%M'),
    r'Medium Time': agate.DateTime(datetime_format='%I:%M %p'),
    r'Long Time': agate.DateTime(datetime_format='%H:%M:%S:%f'),
    r'\@': agate.Text(),
    # TODO add mm\/dd\/yy and so on...
}


def from_html(cls,
              path,
              table_identifier=0,
              header=True,
              encoding='utf-8',
Пример #17
0
import agate
import agatecharts

# NOTE(review): presumably adds the charting methods to agate's classes —
# confirm against the agatecharts documentation.
agatecharts.patch()

OUTPUT_DIR = 'docs/samples'

# Create the output directory if it is missing (only the last path component
# is created; os.mkdir does not create parent directories).
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

# Delete every entry in the output directory before writing new output.
for filename in os.listdir(OUTPUT_DIR):
    os.remove(os.path.join(OUTPUT_DIR, filename))

# Force the ' Date' column to a Date type with the '%Y-%m-%d' format; the
# other columns are left to type inference. Note the leading space in this
# and the other column names used below.
tester = agate.TypeTester(force={
    ' Date': agate.Date('%Y-%m-%d')
})

emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester)

# Add four numeric columns: the day taken from ' Date', and the three
# emission figures with falsy values (such as None) replaced by 0.
emissions = emissions.compute([
    (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'),
    (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'),
    (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'),
    (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2')
])

# Group the rows by the 'State' column.
states = emissions.group_by('State')
state_totals = states.aggregate([
    ('so2', agate.Sum(), 'so2'),
    ('co2', agate.Sum(), 'co2'),
Пример #18
0
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param encoding_override:
        Passed to :func:`xlrd.open_workbook`.
    :param kwargs:
        ``column_names`` (used only when ``header`` is false) and
        ``column_types`` replace the detected values for every requested
        sheet. All other keyword arguments are passed to
        :class:`agate.Table`.
    :returns:
        A single :class:`agate.Table`, or an :class:`agate.MappedSequence`
        of tables keyed by sheet name when ``sheet`` is a sequence.
    :raises ValueError:
        If ``skip_lines`` is not an int.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        book = xlrd.open_workbook(file_contents=path.read(),
                                  encoding_override=encoding_override)
    else:
        with open(path, 'rb') as f:
            book = xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override)

    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    # Take the caller's overrides out of kwargs once, before the loop, so
    # that they apply to every sheet and not only the first one. The
    # presence flags keep an explicit None distinguishable from "not given".
    has_names_override = 'column_names' in kwargs
    names_override = kwargs.pop('column_names', None)
    has_types_override = 'column_types' in kwargs
    types_override = kwargs.pop('column_types', None)

    tables = OrderedDict()

    for sheet_ref in sheets:
        # A name or an index selects a sheet; anything else (such as None)
        # selects the first sheet.
        if isinstance(sheet_ref, six.string_types):
            worksheet = book.sheet_by_name(sheet_ref)
        elif isinstance(sheet_ref, int):
            worksheet = book.sheet_by_index(sheet_ref)
        else:
            worksheet = book.sheet_by_index(0)

        if header:
            offset = 1
            column_names = []
        else:
            offset = 0
            column_names = None

        columns = []
        column_types = []

        for col_idx in range(worksheet.ncols):
            data = worksheet.col_values(col_idx)
            values = data[skip_lines + offset:]
            types = worksheet.col_types(col_idx)[skip_lines + offset:]
            excel_type = determine_excel_type(types)
            agate_type = determine_agate_type(excel_type)

            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                values = normalize_booleans(values)
            elif excel_type == xlrd.biffh.XL_CELL_DATE:
                values, with_date, with_time = normalize_dates(
                    values, book.datemode)
                # Both checks always run, so when neither flag is set the
                # column ends up typed as a date.
                if not with_date:
                    agate_type = agate.TimeDelta()
                if not with_time:
                    agate_type = agate.Date()

            if header:
                # An empty header cell becomes None.
                name = six.text_type(data[skip_lines]) or None
                column_names.append(name)

            columns.append(values)
            column_types.append(agate_type)

        # Transpose the column-wise values into rows, sized by the first
        # column.
        rows = []

        if columns:
            rows = [[column[row_idx] for column in columns]
                    for row_idx in range(len(columns[0]))]

        # Names from the header row take precedence over caller-supplied
        # names; caller-supplied types always replace the detected ones.
        if has_names_override and not header:
            column_names = names_override

        if has_types_override:
            column_types = types_override

        tables[worksheet.name] = agate.Table(rows, column_names, column_types,
                                             **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
Пример #19
0
# https://gist.github.com/jo-tez/7f0a6bad2cd6731d8db16d9542719edb

import agate
import numpy.random as npr
import isodate
from faker import Factory

# Draw 100 values from {0, 1, 2} with probabilities 0.25 / 0.65 / 0.10.
# This runs at import time, so it is not affected by the npr.seed(1) call
# made inside generate_test_data() unless the generator was seeded earlier.
random_groups = npr.choice(3, 100, p=[.25, .65, .10])

# Column names, in the same order as column_types below.
column_names = ['id', 'name', 'hire_date', 'pb', 'salary']

# agate data type for each name above: id, pb and salary are numbers, name
# is text, hire_date is a date.
column_types = [
    agate.Number(),
    agate.Text(),
    agate.Date(),
    agate.Number(),
    agate.Number()
]


def generate_test_data():
    # Set seed to generate consistent test data
    npr.seed(1)

    data_lists = []
    n_recs = 110
    fk = Factory.create()

    for i in range(n_recs):
        payband = npr.choice([1, 2, 3], p=[0.7, 0.25, 0.05])
        payband = int(payband)