Example #1
def create_dpkg(top_level_dict, ev_ob_dict, directory, joint_id):
    """Create the datapackage representation.

    Keyword arguments:
    top_level_dict -- the dictionary with the TOP_LEVEL_INFO
    ev_ob_dict -- the dictionary containing events and objects
    directory -- the directory containing events.csv and objects.csv
    joint_id -- the join column shared by the two tables, used as primary and foreign key
    """

    myDP = dp.DataPackage()

    for k, v in top_level_dict.items():
        myDP.descriptor[k] = v

    myDP.descriptor['resources'] = []

    # the events block #
    key = 'events'
    events_table = ev_ob_dict.get(key)
    path = key + '.csv'
    with io.open(directory + os.sep + key + '.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=50, primary_key=joint_id)
        referenced_resource = key + 'Table'

    myDP.descriptor['resources'].append({
        "name": key + 'Table',
        "path": path,
        "schema": schema,
    })

    # the objects block #
    key = 'objects'
    objects_table = ev_ob_dict.get(key)
    path = key + '.csv'
    with io.open(directory + os.sep + key + '.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=50)
        schema['foreignKeys'] = [{
            "fields": joint_id,
            "reference": {
                "datapackage": "",
                "resource": referenced_resource,
                "fields": joint_id
            }
        }]

    myDP.descriptor['resources'].append({
        "name": key + 'Table',
        "path": path,
        "schema": schema,
    })

    return myDP
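A minimal sketch of a call, assuming events.csv and objects.csv already exist under the target directory; the dictionaries and join column below are hypothetical stand-ins:

top_level = {'name': 'my-package', 'title': 'My data package'}  # hypothetical
tables = {'events': None, 'objects': None}  # only the keys are read here
pkg = create_dpkg(top_level, tables, '/tmp/data', 'event_id')
print(pkg.to_json())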
Example #2
def create_datapackage(ds):
    # Create datapackage based on dataset.json
    dp = datapackage.DataPackage()
    basepath = '{0}/{1}/{2}'.format(DIR, private_or_public(ds), ds['name'])
    dp.metadata['name'] = ds['name']
    dp.metadata['title'] = ds['label']
    dp.metadata['description'] = ds['description']
    if ds['territories']:
        dp.metadata['countryCode'] = ds['territories']
    dp.metadata['profiles'] = {'fiscal': '*','tabular': '*'}
    dp.metadata['resources'] = [{}]
    resource = dp.resources[0]
    resource.metadata['name'] = 'dataset'
    resource.metadata['path'] = 'dataset.csv'
    
    # Infer schema of dataset.csv file
    with io.open(basepath + '/dataset.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=1000)
        resource.metadata['schema'] = schema

    # Translate mapping
    dp.metadata['mapping'] = transform_dataset(ds)
    return dp
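This one targets the older datapackage-py interface, where the descriptor is exposed as metadata and a resource stub is seeded before being filled in. DIR, private_or_public and transform_dataset are module-level names from the surrounding project; a call is just the following, provided dataset.csv exists under the computed basepath:

ds = {'name': 'budget', 'label': 'Budget', 'description': '...', 'territories': ['GB']}  # hypothetical
pkg = create_datapackage(ds)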
Example #3
    def pre_run(self, data_table):

        if (self.schema is None) and self.infer_schema:
            sample_values = data_table.get_sample(300)
            self.schema = self.schema_model(jsontableschema.infer(data_table.headers, sample_values))

        return True, data_table
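Stripped of the pipeline plumbing, the core operation is plain inference over a 300-row sample. A standalone equivalent, with hypothetical headers and rows:

import jsontableschema

headers = ['id', 'name']
sample = [['1', 'alice'], ['2', 'bob']]
descriptor = jsontableschema.infer(headers, sample)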
Example #4
def infer(data, row_limit, encoding, to_file):

    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * the file encoding is assumed to be UTF-8 unless an encoding is passed
      with --encoding
    * the first line of data must be headers
    * these constraints are just for the CLI

    """

    if not row_limit:
        row_limit = None

    with io.open(data, mode='r+t', encoding=encoding) as stream:
        try:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
        except UnicodeDecodeError:
            response = "Could not decode the data file as {0}. " \
                "Please specify an encoding to use with the " \
                "--encoding argument.".format(encoding)
        else:
            response = jsontableschema.infer(headers, values,
                                             row_limit=row_limit)

        if to_file:
            with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
                dest.write(json.dumps(response, ensure_ascii=False, indent=2))

    click.echo(response)
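Note the failure path: when the file cannot be decoded, response holds the error message rather than a schema, and that message is what gets JSON-dumped to --to-file and echoed, with the command still exiting normally.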
Example #5
def infer(data, row_limit, to_file):

    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * data must be UTF-8 encoded
    * the first line of data must be headers
    * these constraints are just for the CLI

    """

    if not row_limit:
        row_limit = None

    with io.open(data, mode='r+t', encoding='utf-8') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        response = jsontableschema.infer(headers, values, row_limit=row_limit)

    if to_file:
        with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
            dest.write(json.dumps(response, ensure_ascii=False, indent=2))

    click.echo(response)
Example #6
def infer_csv(csv_file, outfile, row_limit=0):
    with io.open(outfile, 'w') as fp:
        with io.open(csv_file) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, row_limit=row_limit)
            fp.write(six.u(json.dumps(schema, indent=2, ensure_ascii=False)))
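A hypothetical invocation, capping inference at 100 rows; the six.u call is there for Python 2 text-mode compatibility:

infer_csv('data.csv', 'schema.json', row_limit=100)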
Example #7
def infer(data, row_limit, encoding, to_file):
    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * the file encoding is assumed to be UTF-8 unless an encoding is passed
      with --encoding
    * the first line of data must be headers
    * these constraints are just for the CLI

    """

    if not row_limit:
        row_limit = None

    with io.open(data, mode='r+t', encoding=encoding) as stream:
        try:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
        except UnicodeDecodeError:
            response = "Could not decode the data file as {0}. " \
                "Please specify an encoding to use with the " \
                "--encoding argument.".format(encoding)
        else:
            response = jsontableschema.infer(headers,
                                             values,
                                             row_limit=row_limit)

        if to_file:
            with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
                dest.write(json.dumps(response, ensure_ascii=False, indent=2))

    click.echo(response)
Example #8
def infer_from_df(df, **kwargs):
    # df.iterrows does not preserve types
    h = df.head()
    fields = list(df)
    iterrows = ([str(h[_].values[i]) for _ in fields]
                for i in range(h.shape[0]))
    return infer(fields, iterrows, **kwargs)
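Since infer expects string values as a CSV reader would deliver them, every cell from the df.head() sample is stringified first. A hypothetical run, assuming infer here is jsontableschema.infer:

import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})
schema = infer_from_df(df)  # e.g. id should come back as an integer field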
Example #9
    def test_infer_explicit_true(self):
        filepath = os.path.join(self.data_dir, 'data_infer.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, explicit=True)

        self.assertTrue(schema['fields'][0].get('constraints'))
Example #10
    def test_infer_explicit_true(self):
        filepath = os.path.join(self.data_dir, 'data_infer.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, explicit=True)

        self.assertTrue(schema['fields'][0].get('constraints'))
Example #11
    def pre_run(self, data_table):

        sample_values = data_table.get_sample(300)

        if (self.schema is None) and self.infer_schema:
            self.schema = self.schema_model(jsontableschema.infer(data_table.headers, sample_values))

        if self.schema and self.process_extra_fields:
            self.extra_fields = (set(data_table.headers)).difference(set(self.schema.headers))
            inferred_schema = jsontableschema.infer(data_table.headers, sample_values)
            complete_schema_dict = self.schema._to_python()

            for field in inferred_schema['fields']:
                if field['name'] in self.extra_fields:
                    complete_schema_dict['fields'].append(copy.deepcopy(field))

            self.schema = self.schema_model(complete_schema_dict)

        return True, data_table
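Relative to Example #3, this variant also reconciles headers the declared schema does not cover: it re-runs inference over the sample, then appends the inferred descriptors of just the extra columns onto a copy of the declared schema before rebuilding the model.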
Example #12
    def test_infer_schema_primary_key_list(self):
        primary_key = ['id', 'age']
        filepath = os.path.join(self.data_dir, 'data_infer.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, primary_key=primary_key)
        schema_model = jsontableschema.models.SchemaModel(schema)

        self.assertEqual(schema_model.primaryKey, primary_key)
Example #13
    def test_infer_schema_primary_key_list(self):
        primary_key = ['id', 'age']
        filepath = os.path.join(self.data_dir, 'data_infer.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, primary_key=primary_key)
        schema_model = jsontableschema.model.SchemaModel(schema)

        self.assertEqual(schema_model.primaryKey, primary_key)
Example #14
    def test_infer_schema_row_limit(self):
        filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, row_limit=4)
        schema_model = jsontableschema.models.SchemaModel(schema)

        self.assertEqual(schema_model.get_field('id')['type'], 'integer')
        self.assertEqual(schema_model.get_field('age')['type'], 'integer')
        self.assertEqual(schema_model.get_field('name')['type'], 'string')
Example #15
    def test_infer_schema_row_limit(self):
        filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, row_limit=4)
        schema_model = jsontableschema.model.SchemaModel(schema)

        self.assertEqual(schema_model.get_field('id')['type'], 'integer')
        self.assertEqual(schema_model.get_field('age')['type'], 'integer')
        self.assertEqual(schema_model.get_field('name')['type'], 'string')
Example #16
    def pre_run(self, data_table):

        sample_values = data_table.get_sample(300)

        if (self.schema is None) and self.infer_schema:
            self.schema = self.schema_model(
                jsontableschema.infer(data_table.headers, sample_values))

        if self.schema and self.process_extra_fields:
            self.extra_fields = (set(data_table.headers)).difference(
                set(self.schema.headers))
            inferred_schema = jsontableschema.infer(data_table.headers,
                                                    sample_values)
            complete_schema_dict = self.schema._to_python()

            for field in inferred_schema['fields']:
                if field['name'] in self.extra_fields:
                    complete_schema_dict['fields'].append(copy.deepcopy(field))

            self.schema = self.schema_model(complete_schema_dict)

        return True, data_table
Example #17
def package_yearly_data():
    csv_file_path = CSV_DATE_DIR + YEARLY_DATA_FILE_CSV
    pkg_file_path = PKG_DATE_DIR + YEARLY_DATA_FILE_PKG
    dp = datapackage.DataPackage()
    dp.descriptor['name'] = 'yearly-gas-price'
    dp.descriptor['title'] = 'Yearly Avg Gas Price'
    with io.open(csv_file_path) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values)
        dp.descriptor['resources'] = [
            {
                'name': 'data',
                'path': csv_file_path,
                'schema': schema
            }
        ]
    with open(pkg_file_path, 'w') as f:
        f.write(dp.to_json())
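The inferred schema is embedded inline in the single resource, so the file written with dp.to_json() is a self-contained descriptor. The four path constants come from the surrounding module; the resource's path is whatever csv_file_path resolves to, so keeping CSV_DATE_DIR relative keeps the package portable.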
Example #18
def extra_header(errors, columns, sample, infer_fields=False):
    for column in copy(columns):
        if 'field' not in column:
            # Infer field
            if infer_fields:
                column_sample = []
                for row in sample:
                    value = None
                    if len(row) > column['number']:
                        value = row[column['number']]
                    column_sample.append(value)
                descriptor = infer([column['header']], column_sample)
                column['field'] = Schema(descriptor).fields[0]
            # Add error/remove column
            else:
                message = spec['errors']['extra-header']['message']
                message = message.format(column_number=column['number'])
                errors.append({
                    'code': 'extra-header',
                    'message': message,
                    'row-number': None,
                    'column-number': column['number'],
                })
                columns.remove(column)
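When inference is enabled, each unmatched header gets a one-field schema inferred from that column's sampled values, and Schema(descriptor).fields[0] turns the result into a field object the later checks can use; otherwise the column is reported as an extra-header error and dropped from further processing.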
Example #19
    def __inspect_table(self, table):

        # Start timer
        start = datetime.datetime.now()

        # Prepare vars
        errors = []
        headers = None
        row_number = 0
        fatal_error = False
        checks = copy(self.__checks)
        source = table['source']
        stream = table['stream']
        schema = table['schema']
        extra = table['extra']

        # Prepare table
        try:
            stream.open()
            sample = stream.sample
            headers = stream.headers
            if self.__filter_checks(checks, type='schema'):
                if schema is None and self.__infer_schema:
                    schema = Schema(infer(headers, sample))
            if schema is None:
                checks = self.__filter_checks(checks,
                                              type='schema',
                                              inverse=True)
        except Exception as exception:
            fatal_error = True
            message = str(exception)
            if isinstance(exception, tabulator.exceptions.SourceError):
                code = 'source-error'
            elif isinstance(exception, tabulator.exceptions.SchemeError):
                code = 'scheme-error'
            elif isinstance(exception, tabulator.exceptions.FormatError):
                code = 'format-error'
            elif isinstance(exception, tabulator.exceptions.EncodingError):
                code = 'encoding-error'
            elif isinstance(exception, tabulator.exceptions.IOError):
                code = 'io-error'
            elif isinstance(exception, tabulator.exceptions.HTTPError):
                code = 'http-error'
            else:
                raise
            errors.append({
                'row': None,
                'code': code,
                'message': message,
                'row-number': None,
                'column-number': None,
            })

        # Prepare columns
        if not fatal_error:
            columns = []
            fields = [None] * len(headers)
            if schema is not None:
                fields = schema.fields
            iterator = zip_longest(headers, fields, fillvalue=_FILLVALUE)
            for number, (header, field) in enumerate(iterator, start=1):
                column = {'number': number}
                if header is not _FILLVALUE:
                    column['header'] = header
                if field is not _FILLVALUE:
                    column['field'] = field
                columns.append(column)

        # Head checks
        if not fatal_error:
            head_checks = self.__filter_checks(checks, context='head')
            for check in head_checks:
                if not columns:
                    break
                check['func'](errors, columns, sample)
            for error in errors:
                error['row'] = None

        # Body checks
        if not fatal_error:
            states = {}
            colmap = {column['number']: column for column in columns}
            body_checks = self.__filter_checks(checks, context='body')
            with stream:
                for row_number, headers, row in stream.iter(extended=True):
                    columns = []
                    iterator = zip_longest(headers, row, fillvalue=_FILLVALUE)
                    for number, (header, value) in enumerate(iterator,
                                                             start=1):
                        colref = colmap.get(number, {})
                        column = {'number': number}
                        if header is not _FILLVALUE:
                            column['header'] = colref.get('header', header)
                        if 'field' in colref:
                            column['field'] = colref['field']
                        if value is not _FILLVALUE:
                            column['value'] = value
                        columns.append(column)
                    for check in body_checks:
                        if not columns:
                            break
                        state = states.setdefault(check['code'], {})
                        check['func'](errors, columns, row_number, state)
                    for error in reversed(errors):
                        if 'row' in error:
                            break
                        error['row'] = row
                    if row_number >= self.__row_limit:
                        break
                    if len(errors) >= self.__error_limit:
                        break

        # Stop timer
        stop = datetime.datetime.now()

        # Compose report
        errors = errors[:self.__error_limit]
        report = copy(extra)
        report.update({
            'time': round((stop - start).total_seconds(), 3),
            'valid': not bool(errors),
            'error-count': len(errors),
            'row-count': row_number,
            'headers': headers,
            'source': source,
            'errors': errors,
        })

        return report
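The jsontableschema touchpoint is the fallback during table preparation: when schema-typed checks are requested but no schema was supplied, Schema(infer(headers, sample)) builds one from the sampled rows, and if no schema can be obtained at all, the schema-typed checks are filtered out instead of running against nothing.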