def write(self, bucket, rows, keyed=False, as_generator=False, update_keys=None):
    """https://github.com/frictionlessdata/tableschema-sql-py#storage
    """

    # Check update keys
    if update_keys is not None and len(update_keys) == 0:
        message = 'Argument "update_keys" cannot be an empty list'
        raise tableschema.exceptions.StorageError(message)

    # Get table and description
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    fallbacks = self.__fallbacks.get(bucket, [])

    # Write rows to table
    convert_row = partial(self.__mapper.convert_row, schema=schema, fallbacks=fallbacks)
    writer = Writer(table, schema, update_keys, self.__autoincrement, convert_row)
    with self.__connection.begin():
        gen = writer.write(rows, keyed=keyed)
        if as_generator:
            return gen
        collections.deque(gen, maxlen=0)
def get_table_schema(table):
    """Create a Table Schema descriptor from an SQLAlchemy table.

    See: https://frictionlessdata.io/specs/table-schema/

    There are four possible elements in the Table Schema:
      * fields (an array of field descriptors)
      * primaryKey
      * foreignKeys (an array of foreignKey objects)
      * missingValues (an array of strings to be interpreted as null)
    """
    descriptor = {}
    descriptor['fields'] = get_fields(table)
    descriptor['primaryKey'] = get_primary_key(table)
    fkeys = get_foreign_keys(table)
    if fkeys:
        descriptor['foreignKeys'] = fkeys
    descriptor['missingValues'] = get_missing_values(table)

    schema = tableschema.Schema(descriptor)
    if not schema.valid:
        raise AssertionError(f"""
            Invalid table schema for {table}

            Errors:
            {schema.errors}
        """)

    return descriptor
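For reference, a minimal sketch of a descriptor containing all four elements listed in the docstring above; the field and resource names are illustrative, not taken from any real table.

import tableschema

# Illustrative descriptor with the four Table Schema elements (assumed names/values)
EXAMPLE_DESCRIPTOR = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'author_id', 'type': 'integer'},
        {'name': 'title', 'type': 'string'},
    ],
    'primaryKey': ['id'],
    'foreignKeys': [{
        'fields': ['author_id'],
        'reference': {'resource': 'authors', 'fields': ['id']},
    }],
    'missingValues': ['', 'NA'],
}

assert tableschema.Schema(EXAMPLE_DESCRIPTOR).valid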
def test_validate_invalid_milestone_name(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["custom_milestone_field_names"] = {
        "10": "bad_milestone_name"
    }
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def cast(resource, skip=[]):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            if field.type not in skip:
                row[index] = field.cast_value(row[index])
    return resource
def write(self, bucket, rows, keyed=False, as_generator=False,
          update_keys=None, buffer_size=1000, use_bloom_filter=True):
    """Write to bucket

    # Arguments
        keyed (bool):
            accept keyed rows
        as_generator (bool):
            returns generator to provide writing control to the client
        update_keys (str[]):
            update instead of inserting if key values match existent rows
        buffer_size (int=1000):
            maximum number of rows to try and write to the db in one batch
        use_bloom_filter (bool=True):
            should we use a bloom filter to optimize DB update performance
            (in exchange for some setup time)

    """

    # Check update keys
    if update_keys is not None and len(update_keys) == 0:
        message = 'Argument "update_keys" cannot be an empty list'
        raise tableschema.exceptions.StorageError(message)

    # Get table and description
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    fallbacks = self.__fallbacks.get(bucket, [])

    # Write rows to table
    convert_row = partial(self.__mapper.convert_row, schema=schema, fallbacks=fallbacks)
    autoincrement = self.__get_autoincrement_for_bucket(bucket)
    writer = Writer(
        table, schema,
        # Only PostgreSQL supports "returning", so we don't use autoincrement for all
        autoincrement=autoincrement if self.__dialect in ['postgresql'] else None,
        update_keys=update_keys,
        convert_row=convert_row,
        buffer_size=buffer_size,
        use_bloom_filter=use_bloom_filter)
    with self.__connection.begin():
        gen = writer.write(rows, keyed=keyed)
        if as_generator:
            return gen
        collections.deque(gen, maxlen=0)
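A hedged usage sketch for the write() method above, assuming the tableschema-sql Storage wrapper; the engine URL, bucket name, descriptor, and rows are illustrative.

import sqlalchemy as sa
from tableschema_sql import Storage

engine = sa.create_engine('sqlite://')  # in-memory database, just for the example
storage = Storage(engine=engine)

# Create a bucket (table) from a descriptor, then upsert keyed rows:
# rows whose "id" matches an existing row are updated rather than inserted.
storage.create('articles', {
    'fields': [{'name': 'id', 'type': 'integer'}, {'name': 'title', 'type': 'string'}],
    'primaryKey': 'id',
})
storage.write('articles', [{'id': 1, 'title': 'first'}], keyed=True, update_keys=['id'])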
def test_infer_schema_row_limit(self):
    filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = tableschema.compat.csv_reader(stream)
        schema = tableschema.infer(headers, values, row_limit=4)
    schema_model = tableschema.Schema(schema)

    self.assertEqual(schema_model.get_field('id').type, 'integer')
    self.assertEqual(schema_model.get_field('age').type, 'integer')
    self.assertEqual(schema_model.get_field('name').type, 'string')
def test_infer_schema_primary_key_list(self):
    primary_key = ['id', 'age']
    filepath = os.path.join(self.data_dir, 'data_infer.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = tableschema.compat.csv_reader(stream)
        schema = tableschema.infer(headers, values, primary_key=primary_key)
    schema_model = tableschema.Schema(schema)

    self.assertEqual(schema_model.primary_key, primary_key)
def iter(self, bucket):
    schema = tableschema.Schema(self.describe(bucket))

    datastore_search_url = \
        "{}/datastore_search".format(self.__base_endpoint)
    params = {'resource_id': bucket}
    response = self._make_ckan_request(datastore_search_url, params=params)
    while response['result']['records']:
        for row in response['result']['records']:
            row = self.__mapper.restore_row(row, schema=schema)
            yield row
        next_url = self.__base_url + response['result']['_links']['next']
        response = self._make_ckan_request(next_url)
def schema(self):
    return tableschema.Schema(
        {
            'fields': [{
                'name': 'dest_path',
                'type': 'string',
                'constraints': {
                    'required': True,
                    'unique': True
                }
            }]
        },
        strict=True)
def process_resource(spec, rows):
    schema = spec['schema']
    jts = tableschema.Schema(schema)
    field_names = list(map(lambda f: f['name'], schema['fields']))
    for row in rows:
        flattened_row = [row.get(name) for name in field_names]
        try:
            flattened_row = jts.cast_row(flattened_row)
        except Exception:
            logging.error('Failed to cast row %r', flattened_row)
            raise
        row = dict(zip(field_names, flattened_row))
        yield row
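A minimal illustration of the cast_row() call used above; the schema and values are made up for the example.

import tableschema

# Cast string values to the declared field types in one pass
jts = tableschema.Schema({'fields': [
    {'name': 'id', 'type': 'integer'},
    {'name': 'active', 'type': 'boolean'},
]})
print(jts.cast_row(['42', 'true']))  # -> [42, True]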
def cast(resource, skip=[], wrap={}, wrap_each={}):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            value = row[index]
            if field.type not in skip:
                value = field.cast_value(value)
            if field.type in wrap:
                value = wrap[field.type](value)
            if field.type in wrap_each:
                value = list(map(wrap_each[field.type], value))
            row[index] = value
    return resource
def process_one(rows, field, op, arg):
    spec = rows.spec
    mutated_field = field
    fields = spec['schema']['fields']
    schema = {
        'fields': list(filter(lambda f: f['name'] == mutated_field, fields))
    }
    jts = tableschema.Schema(schema).fields[0]
    arg = jts.cast_value(arg)
    op = OPS[op]
    for row in rows:
        if op(arg, row.get(field)):
            yield row
def iter(self, bucket):

    # Check existence
    if bucket not in self.buckets:
        message = 'Bucket "%s" doesn\'t exist.' % bucket
        raise tableschema.exceptions.StorageError(message)

    # Prepare
    descriptor = self.describe(bucket)
    schema = tableschema.Schema(descriptor)

    # Yield rows
    for pk, row in self.__dataframes[bucket].iterrows():
        row = self.__mapper.restore_row(row, schema=schema, pk=pk)
        yield row
def add_extended_metadata(filename, metadata):
    """Update the given metadata dict with additional tableschema metadata.
    This is only available for files that tableschema can read, which are
    only TSVs and CSVs. Tableschema doesn't add much, but it could be handy
    if it can detect extended types like locations."""
    metadata['previewbytes'] = get_preview_byte_count(filename)
    try:
        ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor
        new_field_definitions = []
        for m, ts in zip(metadata['field_definitions'], ts_info['fields']):
            m['format'] = ts['format']
            new_field_definitions.append(m)
        metadata['field_definitions'] = new_field_definitions
    except tabulator.exceptions.FormatError:
        pass
    return metadata
def iter(self, bucket):

    # Get table and fallbacks
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))
    autoincrement = self.__get_autoincrement_for_bucket(bucket)

    # Open and close transaction
    with self.__connection.begin():
        # Streaming may not work for some backends:
        # http://docs.sqlalchemy.org/en/latest/core/connections.html
        select = table.select().execution_options(stream_results=True)
        result = select.execute()
        for row in result:
            row = self.__mapper.restore_row(
                row, schema=schema, autoincrement=autoincrement)
            yield row
def write_aux(self, bucket, rows, method="upsert"): schema = tableschema.Schema(self.describe(bucket)) datastore_upsert_url = \ "{}/datastore_upsert".format(self.__base_endpoint) records = [] for r in rows: records.append(self.__mapper.convert_row(r, schema)) yield r params = { 'resource_id': bucket, 'method': method, 'force': True, 'records': records } self._make_ckan_request(datastore_upsert_url, method='POST', json=params)
def iter(self, bucket):
    """https://github.com/frictionlessdata/tableschema-sql-py#storage
    """

    # Get table and fallbacks
    table = self.__get_table(bucket)
    schema = tableschema.Schema(self.describe(bucket))

    # Open and close transaction
    with self.__connection.begin():
        # Streaming may not work for some backends:
        # http://docs.sqlalchemy.org/en/latest/core/connections.html
        select = table.select().execution_options(stream_results=True)
        result = select.execute()
        for row in result:
            row = self.__mapper.restore_row(row, schema=schema)
            yield row
def analyze_dataframe(filename, foreign_keys=None):
    # Pandas analysis
    df = pandas.read_csv(filename, sep='\t')
    pandas_info = df.describe(include='all')

    # Tableschema analysis
    ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor

    column_metadata = []
    for column in ts_info['fields'][:10]:
        df_metadata = column.copy()
        col_name = column['name']
        df_metadata.update(get_pandas_field_metadata(pandas_info, col_name))
        df_metadata.update(get_foreign_key(foreign_keys, column))
        column_metadata.append(df_metadata)

    dataframe_metadata = {
        'name': 'Data Dictionary',
        # df.shape[0] seems to have issues determining rows
        'numrows': len(df.index),
        'numcols': df.shape[1],
        'previewbytes': get_preview_byte_count(filename),
        'field_definitions': column_metadata,
        'labels': {
            'name': 'Column Name',
            'type': 'Data Type',
            'format': 'Format',
            'count': 'Number of non-null entries',
            '25': '25th Percentile',
            '50': '50th Percentile',
            '75': '75th Percentile',
            'std': 'Standard Deviation',
            'mean': 'Mean Value',
            'min': 'Minimum Value',
            'max': 'Maximum Value',
            'unique': 'Unique Values',
            'top': 'Top Common',
            'frequency': 'Frequency of Top Common Value',
            'reference': 'Link to resource definition'
        }
    }
    return dataframe_metadata
def process_resource(spec, rows, parameters, logger: LoggerImpl):
    mutated_field = parameters['field']
    fields = spec['schema']['fields']
    schema = {
        'fields': list(filter(lambda f: f['name'] == mutated_field, fields))
    }
    jts = tableschema.Schema(schema).fields[0]
    bad_count = 0
    for i, row in enumerate(rows):
        if bad_count > 100:
            yield row
            continue
        value = row.get(mutated_field)
        try:
            value = jts.cast_value(value)
            row[mutated_field] = value
            yield row
        except Exception:
            bad_count += 1
            logger.bad_value(spec['name'], i, dict(row), mutated_field,
                             row.get(mutated_field))
            yield row
def convert_descriptor_and_rows(self, descriptor, rows):
    """Convert descriptor and rows to Pandas
    """

    # Prepare
    primary_key = None
    schema = tableschema.Schema(descriptor)
    if len(schema.primary_key) == 1:
        primary_key = schema.primary_key[0]
    elif len(schema.primary_key) > 1:
        message = 'Multi-column primary keys are not supported'
        raise tableschema.exceptions.StorageError(message)

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        values = []
        index = None
        for field, value in zip(schema.fields, row):
            try:
                if isinstance(value, float) and np.isnan(value):
                    value = None
                if value and field.type == 'integer':
                    value = int(value)
                value = field.cast_value(value)
            except tableschema.exceptions.CastError:
                value = json.loads(value)
            # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name == primary_key:
                index = value
            else:
                values.append(value)
        data_rows.append(tuple(values))
        index_rows.append(index)

    # Get dtypes
    dtypes = []
    for field in schema.fields:
        if field.name != primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))

    # Create dataframe
    index = None
    columns = schema.headers
    array = np.array(data_rows, dtype=dtypes)
    if primary_key:
        index_field = schema.get_field(primary_key)
        index_dtype = self.convert_type(index_field.type)
        index_class = pd.Index
        if index_field.type in ['datetime', 'date']:
            index_class = pd.DatetimeIndex
        index = index_class(index_rows, name=primary_key, dtype=index_dtype)
        columns = filter(lambda column: column != primary_key, schema.headers)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe
def __infer(self, sample, headers=None):
    if headers is None:
        headers = [None] * len(sample[0]) if sample else []
    schema = tableschema.Schema()
    schema.infer(sample, headers=headers)
    return schema
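A hedged example of the infer pattern used above, with a tiny inline sample; the headers and values are illustrative, and the expected descriptor is only a rough sketch.

import tableschema

schema = tableschema.Schema()
schema.infer([['1', 'alex'], ['2', 'sam']], headers=['id', 'name'])
print(schema.descriptor['fields'])
# Expected to look roughly like:
# [{'name': 'id', 'type': 'integer', 'format': 'default'},
#  {'name': 'name', 'type': 'string', 'format': 'default'}]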
import unittest

import tableschema

from etl.helpers import table_schema

TEST_SCHEMA1: tableschema.Schema = tableschema.Schema({
    "fields": [{
        "name": "field1"
    }, {
        "name": "field2",
        "milestones": [0]
    }],
    "column_based_milestone_names": ["Intake", "Exit", "NinetyDays"],
})

TEST_SCHEMA2: tableschema.Schema = tableschema.Schema({
    "fields": [
        {
            "name": "field1"
        },
        {
            "name": "field2",
            "milestones": [0]
        },
        {
            "name": "field3",
            "milestones": [0, 1]
        },
        {
            "name": "field4",
            "milestones": [2],
import unittest
import typing

import numpy as np
import pandas as pd
import tableschema

from etl.helpers import common, dataset_shape

MEMBER_ORGANIZATION_ID = "sample_id"

TEST_SCHEMA: tableschema.Schema = tableschema.Schema(
    {
        "fields": [
            {"name": "field1"},
            {"name": "field2"},
            {"name": "field3", "allows_multiple": True},
        ]
    }
)

TEST_COLUMN_MAPPING: typing.Dict[str, str] = {
    "internal_column_name1": "field1",
    "internal_column_name2": "field2",
    "internal_column_name3": "field3",
}

TEST_SCHEMA_COL: tableschema.Schema = tableschema.Schema(
    {
        "fields": [
            {"name": "field1"},
            {"name": "field2", "milestones": [0]},
def convert_descriptor(self, bucket, descriptor, index_fields=[], autoincrement=None):
    """Convert descriptor to SQL
    """

    # Prepare
    columns = []
    indexes = []
    fallbacks = []
    constraints = []
    column_mapping = {}
    table_name = self.convert_bucket(bucket)
    comment = _get_comment(
        descriptor.get('title', ''), descriptor.get('description', ''))
    schema = tableschema.Schema(descriptor)

    # Autoincrement
    if autoincrement is not None:
        columns.append(sa.Column(
            autoincrement, sa.Integer, autoincrement=True, nullable=False))

    # Fields
    for field in schema.fields:
        column_type = self.convert_type(field.type)
        if not column_type:
            column_type = sa.Text
            fallbacks.append(field.name)
        nullable = not field.required
        comment = _get_field_comment(field)
        unique = field.constraints.get('unique', False)
        checks = []
        for name, value in field.constraints.items():
            if name == 'minLength':
                checks.append(Check('LENGTH("%s") >= %s' % (field.name, value)))
            elif name == 'maxLength':
                checks.append(Check('LENGTH("%s") <= %s' % (field.name, value)))
            elif name == 'minimum':
                checks.append(Check('"%s" >= %s' % (field.name, value)))
            elif name == 'maximum':
                checks.append(Check('"%s" <= %s' % (field.name, value)))
            elif name == 'pattern':
                if self.__dialect in ['postgresql']:
                    checks.append(Check('"%s" ~ \'%s\'' % (field.name, value)))
                else:
                    checks.append(Check('"%s" REGEXP \'%s\'' % (field.name, value)))
            elif name == 'enum':
                column_type = sa.Enum(*value, name='%s_%s_enum' % (table_name, field.name))
        column = sa.Column(*([field.name, column_type] + checks),
                           nullable=nullable, comment=comment, unique=unique)
        columns.append(column)
        column_mapping[field.name] = column

    # Primary key
    pk = descriptor.get('primaryKey', None)
    if pk is not None:
        if isinstance(pk, six.string_types):
            pk = [pk]
    if autoincrement is not None:
        if pk is not None:
            pk = [autoincrement] + pk
        else:
            pk = [autoincrement]
    if pk is not None:
        constraint = sa.PrimaryKeyConstraint(*pk)
        constraints.append(constraint)

    # Foreign keys
    if self.__dialect in ['postgresql', 'sqlite']:
        fks = descriptor.get('foreignKeys', [])
        for fk in fks:
            fields = fk['fields']
            resource = fk['reference']['resource']
            foreign_fields = fk['reference']['fields']
            if isinstance(fields, six.string_types):
                fields = [fields]
            if resource != '':
                table_name = self.convert_bucket(resource)
            if isinstance(foreign_fields, six.string_types):
                foreign_fields = [foreign_fields]
            composer = lambda field: '.'.join([table_name, field])
            foreign_fields = list(map(composer, foreign_fields))
            constraint = sa.ForeignKeyConstraint(fields, foreign_fields)
            constraints.append(constraint)

    # Indexes
    if self.__dialect == 'postgresql':
        for index, index_definition in enumerate(index_fields):
            name = table_name + '_ix%03d' % index
            index_columns = [column_mapping[field] for field in index_definition]
            indexes.append(sa.Index(name, *index_columns))

    return columns, constraints, indexes, fallbacks, comment
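To make the constraint handling above concrete, here is a hedged sketch of a field descriptor (made-up names and values) and the column options the loop would derive from it.

# Illustrative field descriptor; the trailing comments show the mapping applied above
EXAMPLE_FIELD = {
    'name': 'status',
    'type': 'string',
    'constraints': {
        'required': True,           # -> nullable=False
        'unique': True,             # -> unique=True
        'minLength': 2,             # -> CHECK (LENGTH("status") >= 2)
        'maxLength': 16,            # -> CHECK (LENGTH("status") <= 16)
        'enum': ['draft', 'live'],  # -> sa.Enum('draft', 'live', name='<table>_status_enum')
    },
}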
def test_validate_invalid_milestone(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["milestones"] = [10]
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def test_validate_numeric_enum_missing_value(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["constraints"]["maximum"] = 3
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def test_validate_numeric_enum_not_int(self):
    bad_schema = tableschema.Schema(TEST_SCHEMA2.descriptor)
    bad_schema.descriptor["fields"][4]["type"] = "string"
    bad_schema.commit()
    self.assertFalse(table_schema.validate_schema(bad_schema))
def convert_descriptor(self, bucket, descriptor, index_fields=[], autoincrement=None):
    """Convert descriptor to SQL
    """

    # Prepare
    columns = []
    indexes = []
    fallbacks = []
    constraints = []
    column_mapping = {}
    table_name = self.convert_bucket(bucket)
    schema = tableschema.Schema(descriptor)

    # Autoincrement
    if autoincrement is not None:
        columns.append(sa.Column(
            autoincrement, sa.Integer, autoincrement=True, nullable=False))

    # Fields
    for field in schema.fields:
        column_type = self.convert_type(field.type)
        if not column_type:
            column_type = sa.Text
            fallbacks.append(field.name)
        nullable = not field.required
        column = sa.Column(field.name, column_type, nullable=nullable)
        columns.append(column)
        column_mapping[field.name] = column

    # Primary key
    pk = descriptor.get('primaryKey', None)
    if pk is not None:
        if isinstance(pk, six.string_types):
            pk = [pk]
    if autoincrement is not None:
        if pk is not None:
            pk = [autoincrement] + pk
        else:
            pk = [autoincrement]
    if pk is not None:
        constraint = sa.PrimaryKeyConstraint(*pk)
        constraints.append(constraint)

    # Foreign keys
    if self.__dialect == 'postgresql':
        fks = descriptor.get('foreignKeys', [])
        for fk in fks:
            fields = fk['fields']
            resource = fk['reference']['resource']
            foreign_fields = fk['reference']['fields']
            if isinstance(fields, six.string_types):
                fields = [fields]
            if resource != '':
                table_name = self.convert_bucket(resource)
            if isinstance(foreign_fields, six.string_types):
                foreign_fields = [foreign_fields]
            composer = lambda field: '.'.join([table_name, field])
            foreign_fields = list(map(composer, foreign_fields))
            constraint = sa.ForeignKeyConstraint(fields, foreign_fields)
            constraints.append(constraint)

    # Indexes
    if self.__dialect == 'postgresql':
        for index, index_definition in enumerate(index_fields):
            name = table_name + '_ix%03d' % index
            index_columns = [column_mapping[field] for field in index_definition]
            indexes.append(sa.Index(name, *index_columns))

    return (columns, constraints, indexes, fallbacks)
def convert_descriptor_and_rows(self, descriptor, rows):
    """Convert descriptor and rows to Pandas
    """
    schema = tableschema.Schema(descriptor)

    # Get data/index
    data_rows = []
    index_rows = []
    jtstypes_map = {}
    for row in rows:
        data_values = []
        index_values = []
        for field, value in zip(schema.fields, row):
            try:
                if isinstance(value, float) and np.isnan(value):
                    value = None
                if value and field.type == 'integer':
                    value = int(value)
                value = field.cast_value(value)
            except tableschema.exceptions.CastError:
                value = json.loads(value)
            # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
            if value is None and field.type in ('number', 'integer'):
                jtstypes_map[field.name] = 'number'
                value = np.NaN
            if field.name in schema.primary_key:
                index_values.append(value)
            else:
                data_values.append(value)
        if len(schema.primary_key) == 1:
            index_rows.append(index_values[0])
        elif len(schema.primary_key) > 1:
            index_rows.append(tuple(index_values))
        data_rows.append(tuple(data_values))

    # Create index
    index = None
    if schema.primary_key:
        if len(schema.primary_key) == 1:
            index_class = pd.Index
            index_field = schema.get_field(schema.primary_key[0])
            index_dtype = self.convert_type(index_field.type)
            if index_field.type in ['datetime', 'date']:
                index_class = pd.DatetimeIndex
            index = index_class(index_rows, name=index_field.name, dtype=index_dtype)
        elif len(schema.primary_key) > 1:
            index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key)

    # Create dtypes/columns
    dtypes = []
    columns = []
    for field in schema.fields:
        if field.name not in schema.primary_key:
            field_name = field.name
            if six.PY2:
                field_name = field.name.encode('utf-8')
            dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))
            columns.append(field.name)

    # Create dataframe
    array = np.array(data_rows, dtype=dtypes)
    dataframe = pd.DataFrame(array, index=index, columns=columns)

    return dataframe