Example #1
    def __init__(self, csv_file):
        self.fname_ = csv_file
        self.table_ = Table(self.fname_)
Example #2
    def __init__(self, meta_field_path, filepath):
        # assumes `import pandas as pd` and `from tableschema import Table` at module level
        self.meta_field_path = meta_field_path
        self.filepath = filepath
        self.table = Table(self.filepath)
        self.meta_field = pd.read_csv(self.meta_field_path)
Example #3
    def requery(self):
        self._table = Table(self._filePath)
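Examples #1 and #3 (and #27 further down) are fragments of small wrapper classes around Table. A minimal sketch of how such a wrapper might fit together; the class and attribute names here are illustrative, not taken from any of the excerpted projects:

from tableschema import Table

class CsvSource:
    """Illustrative wrapper: keeps the path so the Table can be rebuilt."""

    def __init__(self, file_path):
        self._file_path = file_path
        self._table = Table(file_path)

    def requery(self):
        # re-open the underlying file by constructing a fresh Table
        self._table = Table(self._file_path)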
Example #4
def test_iter():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [['one', 1], ['two', 2]]
    actual = list(table.iter())
    assert actual == expect
Example #5
from tableschema import Table

fileCSV = r'D:\dct\enem-microdados\DADOS_ENEM_2009.csv'
fileJSON = r'D:\dct\enem-microdados\DADOS_ENEM_2009-schema.json'

# Create table
table = Table(fileCSV)
table.infer(limit=100000)
# table.schema.descriptor
table.schema.save(fileJSON)
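A possible follow-up to Example #5 (the file names are reused from the snippet above, so treat them as placeholders): once the inferred schema has been saved to JSON, it can be passed back to Table so later reads are cast against the saved types instead of being re-inferred.

from tableschema import Table

table = Table(fileCSV, schema=fileJSON)
rows = table.read(keyed=True, limit=10)  # rows come back cast to the schema's types
print(table.schema.field_names)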
Example #6
def test_schema_instance(apply_defaults):
    schema_instance = Schema(SCHEMA_MIN)
    actual = Table(DATA_MIN, schema=schema_instance).schema.descriptor
    expect = apply_defaults(SCHEMA_MIN)
    assert actual == expect
Example #7
def test_schema_infer_tabulator():
    table = Table('data/data_infer.csv')
    table.infer()
    assert table.headers == ['id', 'age', 'name']
    assert table.schema.descriptor == SCHEMA_CSV
Example #8
def test_size_remote():
    table = Table(BASE_URL % 'data/data.csv')
    table.read()
    assert table.size == SIZE
Example #9
def test_size_not_read():
    table = Table(BASE_URL % 'data/data.csv')
    assert table.size is None
Example #10
def test_size():
    table = Table('data/data.csv')
    table.read()
    assert table.size == SIZE
Example #11
def test_size_compressed():
    table = Table('data/data.csv.zip')
    table.read()
    assert table.size == SIZE
Example #12
def test_read_limit():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [['one', 1]]
    actual = table.read(limit=1)
    assert actual == expect
Example #13
def test_read_keyed():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [{'key': 'one', 'value': 1}, {'key': 'two', 'value': 2}]
    actual = table.read(keyed=True)
    assert actual == expect
Example #14
def test_iter_keyed():
    table = Table(DATA_MIN, schema=SCHEMA_MIN)
    expect = [{'key': 'one', 'value': 1}, {'key': 'two', 'value': 2}]
    actual = list(table.iter(keyed=True))
    assert actual == expect
Example #15
def test_read_integrity_hash():
    table = Table('data/data.csv')
    table.read(integrity={'hash': HASH})
    assert True
Example #16
def test_hash():
    table = Table('data/data.csv')
    table.read()
    assert table.hash == HASH
Example #17
def test_read_integrity_hash_error():
    table = Table('data/data.csv')
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        table.read(integrity={'hash': HASH + 'a'})
    assert HASH in str(excinfo.value)
Example #18
def test_hash_compressed():
    table = Table('data/data.csv.zip')
    table.read()
    assert table.hash == HASH
Example #19
def test_schema_descriptor(apply_defaults):
    actual = Table(DATA_MIN, schema=SCHEMA_MIN).schema.descriptor
    expect = apply_defaults(SCHEMA_MIN)
    assert actual == expect
Example #20
def test_hash_remote():
    table = Table(BASE_URL % 'data/data.csv')
    table.read()
    assert table.hash == HASH
Example #21
def test_iter_missing_cols_stream_closed():
    table = Table('data/data_missing_cols.csv', schema=SCHEMA_MIN)
    with pytest.raises(exceptions.CastError) as excinfo:
        for _ in table.iter():
            pass
    assert table._Table__stream.closed
Example #22
def test_hash():
    table = Table(BASE_URL % 'data/data.csv')
    assert table.hash is None
Example #23
def test_iter_csv():
    table = Table('data/data_infer.csv', schema=SCHEMA_CSV)
    expect = [[1, 39, 'Paul'], [2, 23, 'Jimmy'], [3, 36, 'Jane'],
              [4, 28, 'Judy']]
    actual = list(table.iter())
    assert actual == expect
Example #24
def test_read_integrity():
    table = Table('data/data.csv')
    table.read(integrity={'size': SIZE, 'hash': HASH})
    assert True
Example #25
    def create_table_from_csv(form, table):
        """Uploads a csv file and creates a superset datasource in Hive."""
        def convert_to_hive_type(col_type):
            """maps tableschema's types to hive types"""
            tableschema_to_hive_types = {
                'boolean': 'BOOLEAN',
                'integer': 'INT',
                'number': 'DOUBLE',
                'string': 'STRING',
            }
            return tableschema_to_hive_types.get(col_type, 'STRING')

        bucket_path = config['CSV_TO_HIVE_UPLOAD_S3_BUCKET']

        if not bucket_path:
            logging.info('No upload bucket specified')
            raise Exception(
                'No upload bucket specified. You can specify one in the config file.')

        table_name = form.name.data
        schema_name = form.schema.data

        if config.get('UPLOADED_CSV_HIVE_NAMESPACE'):
            if '.' in table_name or schema_name:
                raise Exception(
                    "You can't specify a namespace. "
                    'All tables will be uploaded to the `{}` namespace'.format(
                        config.get('HIVE_NAMESPACE')))
            full_table_name = '{}.{}'.format(
                config.get('UPLOADED_CSV_HIVE_NAMESPACE'), table_name)
        else:
            if '.' in table_name and schema_name:
                raise Exception(
                    "You can't specify a namespace both in the name of the table "
                    'and in the schema field. Please remove one')

            full_table_name = '{}.{}'.format(
                schema_name, table_name) if schema_name else table_name

        filename = form.csv_file.data.filename

        upload_prefix = config['CSV_TO_HIVE_UPLOAD_DIRECTORY']
        upload_path = config['UPLOAD_FOLDER'] + \
            secure_filename(filename)

        # Optional dependency
        from tableschema import Table  # pylint: disable=import-error
        hive_table_schema = Table(upload_path).infer()
        column_name_and_type = []
        for column_info in hive_table_schema['fields']:
            column_name_and_type.append(
                '`{}` {}'.format(
                    column_info['name'],
                    convert_to_hive_type(column_info['type'])))
        schema_definition = ', '.join(column_name_and_type)

        # Optional dependency
        import boto3  # pylint: disable=import-error

        s3 = boto3.client('s3')
        location = os.path.join('s3a://', bucket_path, upload_prefix, table_name)
        s3.upload_file(
            upload_path, bucket_path,
            os.path.join(upload_prefix, table_name, filename))
        sql = f"""CREATE TABLE {full_table_name} ( {schema_definition} )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
            TEXTFILE LOCATION '{location}'
            tblproperties ('skip.header.line.count'='1')"""
        logging.info(form.con.data)
        engine = create_engine(form.con.data.sqlalchemy_uri_decrypted)
        engine.execute(sql)
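What Table(upload_path).infer() hands back in the snippet above is the inferred schema descriptor: a dict whose 'fields' entries carry at least a 'name' and a 'type', which is what convert_to_hive_type() walks over. A minimal sketch of that shape, using a hypothetical CSV path:

from tableschema import Table

descriptor = Table('people.csv').infer()  # hypothetical file
for field in descriptor['fields']:
    print(field['name'], field['type'])   # e.g. "age integer"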
Example #26
def test_read_integrity_size():
    table = Table('data/data.csv')
    table.read(integrity={'size': SIZE})
    assert True
Example #27
    def __init__(self, filePath):
        self._filePath = filePath
        self._table = Table(filePath)
Example #28
def test_read_integrity_size_error():
    table = Table('data/data.csv')
    with pytest.raises(exceptions.IntegrityError) as excinfo:
        table.read(integrity={'size': SIZE + 1})
    assert str(SIZE) in str(excinfo.value)
Example #29
    def create_table_from_csv(cls, form, table):
        """Uploads a csv file and creates a superset datasource in Hive."""

        def convert_to_hive_type(col_type):
            """maps tableschema's types to hive types"""
            tableschema_to_hive_types = {
                "boolean": "BOOLEAN",
                "integer": "INT",
                "number": "DOUBLE",
                "string": "STRING",
            }
            return tableschema_to_hive_types.get(col_type, "STRING")

        bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

        if not bucket_path:
            logging.info("No upload bucket specified")
            raise Exception(
                "No upload bucket specified. You can specify one in the config file."
            )

        table_name = form.name.data
        schema_name = form.schema.data

        if config.get("UPLOADED_CSV_HIVE_NAMESPACE"):
            if "." in table_name or schema_name:
                raise Exception(
                    "You can't specify a namespace. "
                    "All tables will be uploaded to the `{}` namespace".format(
                        config.get("HIVE_NAMESPACE")
                    )
                )
            full_table_name = "{}.{}".format(
                config.get("UPLOADED_CSV_HIVE_NAMESPACE"), table_name
            )
        else:
            if "." in table_name and schema_name:
                raise Exception(
                    "You can't specify a namespace both in the name of the table "
                    "and in the schema field. Please remove one"
                )

            full_table_name = (
                "{}.{}".format(schema_name, table_name) if schema_name else table_name
            )

        filename = form.csv_file.data.filename

        upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY"]
        upload_path = config["UPLOAD_FOLDER"] + secure_filename(filename)

        # Optional dependency
        from tableschema import Table  # pylint: disable=import-error

        hive_table_schema = Table(upload_path).infer()
        column_name_and_type = []
        for column_info in hive_table_schema["fields"]:
            column_name_and_type.append(
                "`{}` {}".format(
                    column_info["name"], convert_to_hive_type(column_info["type"])
                )
            )
        schema_definition = ", ".join(column_name_and_type)

        # Optional dependency
        import boto3  # pylint: disable=import-error

        s3 = boto3.client("s3")
        location = os.path.join("s3a://", bucket_path, upload_prefix, table_name)
        s3.upload_file(
            upload_path, bucket_path, os.path.join(upload_prefix, table_name, filename)
        )
        sql = f"""CREATE TABLE {full_table_name} ( {schema_definition} )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
            TEXTFILE LOCATION '{location}'
            tblproperties ('skip.header.line.count'='1')"""
        logging.info(form.con.data)
        engine = create_engine(form.con.data.sqlalchemy_uri_decrypted)
        engine.execute(sql)
Example #30
            output_row['pseudo_align_70m_SD_mm'] = ''
            output_row['mean_top_70m_mm'] = input_row['mean_top_70m_mm']

            # unvalidated extension point for non-standard geometry items
            output_row['extended_items_geometry'] = json.dumps(
                {'curvature_mm': input_row['curvature_mm']})

            output_row['accel_z_wb_ms_2'] = input_row['accel_z_wb_ms_2']
            output_row['accel_x_wc_ms_2'] = input_row['accel_x_wc_ms_2']
            output_row['accel_x_wd_ms_2'] = input_row['accel_x_wd_ms_2']
            output_row['accel_y_wd_ms_2'] = input_row['accel_y_wd_ms_2']
            output_row['accel_y_wp_ms_2'] = input_row['accel_y_wp_ms_2']
            output_row['creating_adapter_version'] = ADAPTER_VERSION
            output_row['data_row_uid'] = uuid.uuid4()

            wr.writerow(output_row)

if args.schema is not None:
    # validate the output file against the schema
    # print(args.schema.name)
    tbl = Table(out_file.name, schema=args.schema.name)
    # print('checking...')
    try:
        tbl.read(limit=2000)
        print('OK')

    except exceptions.TableSchemaException as exception:
        for error in exception.errors:
            print(error)

time.sleep(5)
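If the bulk tbl.read(limit=2000) check in Example #30 is too coarse, a per-row variant can report roughly where casting broke down. A minimal sketch with hypothetical file names:

from tableschema import Table, exceptions

table = Table('output.csv', schema='schema.json')  # hypothetical paths
rows_ok = 0
try:
    for _ in table.iter():
        rows_ok += 1
    print('OK,', rows_ok, 'rows validated')
except exceptions.CastError as exception:
    print('cast failed after', rows_ok, 'valid rows')
    for error in exception.errors:
        print(error)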