Example #1
    def getSchema(self):
        '''
        Get data schema inferred from records;
            the number of records is defined by SCHEMA_INFER_LIMIT,
            the confidence threshold is defined by SCHEMA_INFER_CONFIDENCE.
        '''
        try:
            tmpCursor = self.getCursor()
            tmpCursor.execute(self.query_sql)
            i = settings.SCHEMA_INFER_LIMIT
            tmpList = []
            row = tmpCursor.fetchone()
            while i > 0 and row is not None:
                tmpList.append(row)
                row = tmpCursor.fetchone()
                i -= 1
            tmpList = [self.getColumns()] + tmpList
            t = Table(tmpList)
            t.infer()
            t.schema.descriptor[
                'missingValues'] = settings.SCHEMA_DATABASE_MISSING_VALUES
            return t.infer(confidence=settings.SCHEMA_INFER_CONFIDENCE)

        except Exception as e:
            tmpCursor.close()
            raise e
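The settings referenced above are not part of the snippet; a minimal sketch of what they might contain is shown below (the names come from the code, the values are purely illustrative).

# Hypothetical settings module assumed by getSchema(); values are illustrative.
SCHEMA_INFER_LIMIT = 500                       # row limit passed to Table.infer()
SCHEMA_INFER_CONFIDENCE = 0.85                 # confidence passed to Table.infer()
SCHEMA_DATABASE_MISSING_VALUES = ['', 'NULL']  # strings treated as missing values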
Example #2
    def _is_columns_dtype_same(self,
                               row_limit: int = 500,
                               confidence: float = 0.85):
        train_table = Table(self.train_csv_path)
        predict_table = Table(self.predict_csv_path)
        train_schema = train_table.infer(limit=row_limit,
                                         confidence=confidence)
        predict_schema = predict_table.infer(limit=row_limit,
                                             confidence=confidence)

        train_fields = train_schema['fields']
        predict_fields = predict_schema['fields']

        train_col_names = [d['name'] for d in train_fields]
        predict_col_names = [d['name'] for d in predict_fields]

        num_col_diff = 0
        for predict_dict in predict_fields:
            predict_dtype = predict_dict['type']
            predict_name = predict_dict['name']

            train_idx = train_col_names.index(predict_name)
            train_dict = train_fields[train_idx]
            train_dtype = train_dict['type']

            if train_dtype != predict_dtype:
                raise Exception(
                    f'Column: {predict_name} has different data types between train and predict CSV'
                )
Example #3
def test_schema_infer_storage(import_module, apply_defaults):
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    table = Table('table', storage='storage')
    table.infer()
    assert table.headers == ['key', 'value']
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
Example #4
def test_schema_infer_missing_values():
    table = Table('data/data_infer_missing_values.csv')
    table.infer(missing_values=['-'])
    schema = deepcopy(SCHEMA_CSV)
    schema['missingValues'] = ['-']
    assert table.schema.descriptor == schema
    assert table.read() == [[1, 39, 'Paul'], [None, 25, 'Test'],
                            [2, 23, 'Jimmy'], [None, 25, 'Test'],
                            [3, 36, 'Jane'], [None, 25, 'Test'],
                            [4, 28, 'Judy']]
Example #5
def test_read_storage(import_module):
    # Mocks
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    # Tests
    table = Table('table', storage='storage')
    table.infer()
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
Example #6
    def infer(self, limit=None, confidence=.75):
        """
        Infer the current type by looking at the values in the table
         """
        # Do initial infer to set up headers and schema.
        Table.infer(self)

        rows = self.read(cast=False)
        headers = self.headers
        # Get descriptor
        fields = []
        type_matches = {}
        for header in headers:
            fields.append({'name': header})

        rindex = 0
        for rindex, row in enumerate(rows):
            if limit is not None and rindex == limit:
                break
            # build a column-wise lookup of type matches
            for cindex, value in enumerate(row):
                typeid = self.__get_type(value, type_matches.get(cindex, None))
                type_matches[cindex] = typeid
        self.row_count = rindex
        url_type = type(urlparse('foo'))

        for index, results in type_matches.items():
            type_name, type_format = None, 'default'
            if results is bool:
                type_name = 'boolean'
            elif results is int:
                type_name = 'integer'
            elif results is float:
                type_name = 'number'
            elif results is str:
                type_name = 'string'
            elif results is datetime.datetime:
                type_name = 'datetime'
                type_format = 'any'
            elif results is url_type:
                type_name = 'string'
                type_format = 'uri'
            else:
                raise DerivaCSVError(msg='Bad type in infer')

            fields[index].update({'type': type_name, 'format': type_format})
        # Now update the schema to have the inferred values.
        self.schema.descriptor['fields'] = fields

        # Reset the key constraints as they were blasted away by the infer.
        self.__set_key_constraints()
        self.schema.commit()
        return
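    # The __get_type helper used above is not included in the snippet. Below is
    # a minimal, illustrative sketch of one plausible implementation: it detects
    # the Python type a cell value parses as and widens the column's previous
    # match to str whenever observations conflict. It is an assumption, not the
    # original helper.
    def __get_type(self, value, previous):
        import datetime
        from urllib.parse import urlparse

        def detect(v):
            if not isinstance(v, str):
                return type(v)
            if v.lower() in ('true', 'false'):
                return bool
            for caster, typeid in ((int, int), (float, float)):
                try:
                    caster(v)
                    return typeid
                except ValueError:
                    pass
            try:
                datetime.datetime.fromisoformat(v)
                return datetime.datetime
            except ValueError:
                pass
            if urlparse(v).scheme in ('http', 'https'):
                return type(urlparse(v))  # matches url_type above
            return str

        current = detect(value)
        if previous is None or previous is current:
            return current
        return str  # conflicting observations fall back to string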
Example #7
def task5():
    table = Table('movies.csv')
    table.infer()

    # fixing fields
    # IDs should be strings
    table.schema.descriptor['fields'][0]['type'] = 'string'
    table.schema.descriptor['fields'][1]['type'] = 'string'
    table.schema.commit()
    print(table.schema.valid)

    table.schema.save('schema.json')
Example #8
def infer(path, limit=2000):
    table = Table(path)
    table.infer(limit=limit, confidence=0.75)

    data = pd.read_csv(path, low_memory=False)
    num_rows = len(data.index)

    rows_to_scan = limit if limit < num_rows else num_rows

    metadata_array = []
    for field in table.schema.fields:
        metadata = Metadata()

        metadata.name = field.name
        metadata.type = field.type
        metadata.format = field.format

        object_description = data[field.name].astype(object).describe()

        missing_count = num_rows - int(object_description['count'])
        metadata.missing_count = missing_count
        metadata.missing_percentage = round(
            float(missing_count) / num_rows * 100, 2)

        distinct_count = int(object_description['unique'])
        metadata.distinct_count = distinct_count
        metadata.distinct_percentage = round(
            float(distinct_count) / (num_rows - missing_count) * 100, 2)

        metadata.most_frequent = object_description['top']

        if metadata.type == "string" and metadata.missing_percentage != 100.0:
            if rows_to_scan == data[metadata.name].head(rows_to_scan).apply(
                    lambda x: is_date(x)).sum():
                metadata.type = "date"

        if (metadata.type == "integer" or metadata.type
                == "number") and (data.dtypes[field.name] == "int64"
                                  or data.dtypes[field.name] == "float64"):
            numeric_description = data[field.name].describe()
            metadata.min = numeric_description['min']
            metadata.max = numeric_description['max']

        metadata.ml_type = metadata.type
        if metadata.type == "integer" or metadata.type == "number":
            metadata.ml_type = "numeric"
        if metadata.type == "string":
            metadata.ml_type = "open_text"

        metadata_array.append(metadata)

    return metadata_array
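A minimal usage sketch for the function above; the CSV path is a placeholder, and Metadata and is_date are assumed to be defined elsewhere in the same module.

if __name__ == '__main__':
    # Illustrative only: 'people.csv' is a placeholder path.
    for meta in infer('people.csv', limit=500):
        print(meta.name, meta.type, meta.ml_type,
              meta.missing_percentage, meta.distinct_count)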
Example #9
def test_read_storage_passed_as_instance():
    # Mocks
    storage = Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
        spec=Storage,
    )
    # Tests
    table = Table('table', storage=storage)
    table.infer()
    expect = [['one', 1], ['two', 2]]
    actual = table.read()
    assert actual == expect
Example #10
 def getSchema(self):
     '''
     Get data schema inferred from records;
         the number of records is defined by SCHEMA_INFER_LIMIT,
         the confidence threshold is defined by SCHEMA_INFER_CONFIDENCE.
     '''
     t = Table(self._filePath)
     t.infer()
     t.schema.descriptor[
         'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
     t.schema.commit()
     return t.infer(
         settings.SCHEMA_INFER_LIMIT,
         confidence=settings.SCHEMA_INFER_CONFIDENCE)
Example #11
def test_processors():
    # Processor
    def skip_under_30(erows):
        for row_number, headers, row in erows:
            krow = dict(zip(headers, row))
            if krow['age'] >= 30:
                yield (row_number, headers, row)

    # Create table
    table = Table('data/data_infer.csv', post_cast=[skip_under_30])
    table.infer()
    expect = [[1, 39, 'Paul'], [3, 36, 'Jane']]
    actual = table.read()
    assert actual == expect
Example #12
def test_infer_schema_empty_file():
    s = Table('data/empty.csv')
    d = s.infer()
    assert d == {
        'fields': [],
        'missingValues': [''],
    }
Example #13
class TableLayoutOrder:
    '''
    Find the table using the order of the datatypes
    '''
    def __init__(self, meta_field_path, filepath):
        self.meta_field_path = meta_field_path
        self.filepath = filepath
        self.table = Table(self.filepath)
        self.meta_field = pd.read_csv(self.meta_field_path)

    def get_input_array(self):
        input_table_arr = []
        for i, value in enumerate(self.table.infer()['fields']):
            val = str(value['type'] + str(i + 1))
            input_table_arr.append(val)
        return input_table_arr

    def check_name(self):
        print(self.filepath)
        info = TableInformation(self.filepath)
        deli = info.show_delimeter()
        with open(self.filepath) as f:
            reader = csv.reader(f, delimiter=deli)
            line = next(reader)
        unique_tables = self.meta_field['TableName'].unique()
        print('sss', unique_tables)
        array_name = []
        for i in unique_tables:
            temp = self.meta_field.loc[self.meta_field["TableName"] == i]
            if np.array_equal(sorted(line),
                              sorted(temp["fields"].values)):
                array_name.append(i)
            # print(temp["fields"].values)
        return array_name

    def check_ordered(self, check_tables=None):
        if not check_tables:
            #unique_tables = check_tables
            #unique_tables = list(set(unique_tables) & set(check_tables))
            unique_tables = self.check_name()

        else:
            print(self.check_name())
            unique_tables = self.check_name()

        probable = []
        print(unique_tables)
        for i in unique_tables:
            unique_table_arr = []
            temp = self.meta_field.loc[self.meta_field['TableName'] ==
                                       i].copy()
            for index, row in temp.iterrows():
                unique_table_arr.append(
                    str(row['datatype']) + str(row['order']))

            if np.array_equal(sorted(self.get_input_array()),
                              sorted(unique_table_arr)):
                probable.append(i)
        return probable
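A brief usage sketch for the class above; the paths are placeholders, and the metadata CSV is assumed to contain the TableName, fields, datatype and order columns that the methods read.

# Illustrative only: both file paths are placeholders.
layout = TableLayoutOrder('meta_fields.csv', 'incoming_data.csv')
by_name = layout.check_name()        # tables whose column names match the header row
by_order = layout.check_ordered()    # of those, tables whose datatypes and order also match
print(by_name, by_order)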
Example #14
def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]:
    table.infer(limit=max_rows)

    fields: List[SchemaField] = []

    for raw_field in table.schema.fields:
        mapped_type: Type = tableschema_type_map.get(raw_field.type,
                                                     NullTypeClass)

        field = SchemaField(
            fieldPath=raw_field.name,
            type=SchemaFieldDataType(mapped_type()),
            nativeDataType=str(raw_field.type),
            recursive=False,
        )
        fields.append(field)

    return fields
Example #15
 def reindex(self):
     file_path, params = self.get_file_info()
     t = Table(file_path, ignore_blank_headers=True, **params)
     schema = t.infer()
     data = t.iter(keyed=True)
     self.storage.create(self.index_name,
                         schema,
                         reindex=True,
                         always_recreate=True)
     # storage.write may return a generator; iterate it to drive the load
     for res in self.storage.write(self.index_name, data):
         pass
Example #16
def task5(inputcsv, outputjson):
  # requires the tableschema package: pip install tableschema
  from tableschema import Table

  table = Table(inputcsv)
  table.infer()

  # missingValues must be a list of strings per the Table Schema spec
  table.schema.descriptor["missingValues"] = ["N/A"]
  table.schema.commit()
  assert table.schema.valid, table.schema.errors

  table.read(keyed=True)
  table.schema.save(outputjson)
Example #17
def geraSchema(csvFile, tabelName):
    from tableschema import Table
    import json
    # open the CSV
    table = Table(csvFile)
    #, schema='schema.json')

    table.infer(limit=100000)
    json_data = table.schema.descriptor
    # print json_data.keys()
    sqlCommand = "DROP TABLE IF EXISTS " + tabelName + "; \n" + "CREATE TABLE " + tabelName + " ( " + "\n"
    t = len(json_data['fields'])
    cont = 0
    for campos in json_data['fields']:
        cont = cont + 1
        comma = ","
        if (cont == t):
            comma = " "
        typeData = campos['type']
        if campos['name'] == "NU_INSCRICAO":
            typeData = "varchar(12)"
        elif typeData == "string":
            typeData = "varchar"
        elif typeData == "number":
            typeData = "varchar(20)"  # "numeric(7,2)"
        elif typeData == "integer":
            typeData = "varchar(10)"
        sqlCommand = sqlCommand + "\t" + \
            campos['name'] + "\t" + typeData + comma + "\n"

    sqlCommand = sqlCommand + "); \t"
    # print sqlCommand
    return sqlCommand
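For illustration, a hypothetical call to the function above and the rough shape of the SQL it builds (the CSV path, table name and inferred columns are made up; the real output depends on what Table.infer() detects).

# Illustrative only: arguments and inferred columns are placeholders.
print(geraSchema('enem_2009.csv', 'enem_2009'))
# DROP TABLE IF EXISTS enem_2009;
# CREATE TABLE enem_2009 (
#     NU_INSCRICAO    varchar(12),
#     NU_NOTA_MT      varchar(20)
# );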
Example #18
class DataCleaner(object):
    """DataCleaner has a set of functions to automatically clean 
    tabular data. It does things like
    - infer data type (numeric, categorical, date, boolean)
    - infer missing values
    - automatically describe data in simple (lay terms)
    - automatically run some inference / correlation algos
    """
    def __init__(self, csv_file):
        self.fname_ = csv_file
        self.table_ = Table(self.fname_)

    def InferType(self):
        # First pass: use tableschema inference to get syntactic schema
        # syntactic schema: integer, float, string, date
        syn_schema = self.table_.infer()

        # convert to json
        syn_schema_json = json.dumps(syn_schema, indent=4)
        print("json:" + str(syn_schema_json))

        #
        # semantic schema for a column:
        #   numeric: integer, float, integer range, float range, date, date range
        #   categorical: boolean, enum
        #   string: person name, geo location, etc
        #
        # other attributes for a column: is_identifier, is_unique, missing_id, human_description
        #
        # Signals from name (i.e., things like "id" "number" "num" "count" "code" "name")
        # - numeric-ness: "num" etc
        # - categorical-ness
        # - id-ness: "id" "code"
        # - string-ness: "name"
        #
        # Signals from distribution (for syntactic types integer, float)
        # - basic stats: min, max, median, mode, etc
        # - gaussian fit?
        # - smoothness / monotonicity: bumpiness denotes categorical (code)
        #
        # Signals from distribution (for syntactic types integer, string, date)
        # - num_distinct_val
        # - num_distinct_val_for_90%
        # - boolean-ness: "Yes" "No"; 0/1; up/down, etc
        # - nameness
        # - camelcase: denotes categorical
        #
        return syn_schema
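    # A minimal sketch of two of the distribution signals listed in the comment
    # block above (distinct-value count and boolean-ness). The method name,
    # thresholds and value sets are illustrative assumptions, not part of the
    # original DataCleaner.
    def _distribution_signals(self, column_name, max_categorical_ratio=0.05):
        import pandas as pd  # assumed available; not imported in the snippet above

        values = pd.read_csv(self.fname_, usecols=[column_name])[column_name].dropna()
        distinct = values.astype(str).str.strip().str.lower().unique()

        return {
            'num_distinct_val': len(distinct),
            # few distinct values relative to rows hints at a categorical column
            'looks_categorical': len(distinct) <= max(2, int(max_categorical_ratio * len(values))),
            # classic boolean encodings mentioned above: "Yes"/"No", 0/1, up/down
            'looks_boolean': set(distinct) <= {'yes', 'no', 'true', 'false', '0', '1', 'up', 'down'},
        }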
Example #19
    def schema(self, use_headers=True):
        if not self.resource.file:
            raise ValidationError(_('File does not exist'))

        if self.resource.format not in ('csv', 'tsv', 'xls', 'xlsx', 'ods'):
            raise ValidationError(_('Invalid file type'))

        if not self._schema_cache:
            _schema = self.resource.tabular_data_schema or None
            if not _schema:
                _table = Table(self.resource.file.path,
                               ignore_blank_headers=True,
                               format=self.resource.format,
                               encoding=self.resource.file_encoding or 'utf-8')
                _schema = _table.infer(limit=5000)
            self._schema_cache = _schema

        return self._schema_cache
Example #20
		# Define a DataSet Schema
		dsr = DataSetRequest()
		dsr.name = "YOUR DATASET NAME"  # PUT YOUR DATASET NAME LOGIC HERE
		dsr.description = "YOUR DATASET DESCRIPTION"  # DATASET DESCRIPTION
		dsr.schema = Schema(jeff)

		# Create a DataSet with the given Schema
		dataset = datasets.create(dsr)
		domo.logger.info("Created DataSet " + dataset['id'])

		# Get a DataSets's metadata
		retrieved_dataset = datasets.get(dataset['id'])
		domo.logger.info("Retrieved DataSet " + retrieved_dataset['id'])

		# List DataSets
		dataset_list = list(datasets.list(sort=Sorting.NAME))
		domo.logger.info("Retrieved a list containing {} DataSet(s)".format(
                                                        len(dataset_list)))
		csv_file_path = allFiles
		datasets.data_import_from_file(dataset['id'], csv_file_path)
		domo.logger.info("Uploaded data from a file to DataSet {}".format(
			dataset['id']))


for eachCSV in os.listdir(inputdir):
	allFiles = os.path.abspath(os.path.join(inputdir, eachCSV))  # resolve against inputdir, not the cwd
	table = Table(allFiles)
	table.infer()
	jeff = table.schema.descriptor
	table.read(keyed=True)
	directoryUpload()
Example #21
from tableschema import Table

fileCSV = r'D:\dct\enem-microdados\DADOS_ENEM_2009.csv'
fileJSON = r'D:\dct\enem-microdados\DADOS_ENEM_2009-schema.json'

# Create table
table = Table(fileCSV)
table.infer(limit=100000)
# table.schema.descriptor
table.schema.save(fileJSON)
Example #22
class CsvReader(DataReader):
    '''This is a wrapper class for the tableschema library.'''

    def __init__(self, filePath):
        self._filePath = filePath
        self._table = Table(filePath)

    def close(self):
        pass

    def getColumns(self):
        if not self._table.headers:
            self._table.infer(
                settings.SCHEMA_INFER_LIMIT)
        # clean the headers
        result = []
        for header in self._table.headers:
            tmpheader = header.lower()
            tmpheader = tmpheader.replace(' ', '_').replace('-', '_')
            r = re.search(r'\w+', tmpheader)
            if r:
                result.append(r.group())
            else:
                raise InvalidCsvHeaderException(
                    '%s is not a valid header' % header)
        return result

    def requery(self):
        self._table = Table(self._filePath)

    def getRow(self):
        i = self._table.iter(cast=True)
        return next(i)

    def getRowsList(self):
        self._table.infer()
        self._table.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        self._table.schema.commit()
        i = self._table.iter(cast=True)

        return list(map(tuple, i))

    def getSchema(self):
        '''
        Get data schema inferred from records;
            the number of records is defined by SCHEMA_INFER_LIMIT,
            the confidence threshold is defined by SCHEMA_INFER_CONFIDENCE.
        '''
        t = Table(self._filePath)
        t.infer()
        t.schema.descriptor[
            'missingValues'] = settings.SCHEMA_CSV_MISSING_VALUES
        t.schema.commit()
        return t.infer(
            settings.SCHEMA_INFER_LIMIT,
            confidence=settings.SCHEMA_INFER_CONFIDENCE)

    def __repr__(self):
        return (settings.SOURCE_TYPE_CSV_PREFIX +
                self._filePath)
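A quick illustration of the header cleaning performed by getColumns() above; the file name and headers are made up for the example.

# Illustrative only: 'people.csv' is a placeholder path.
reader = CsvReader('people.csv')
# A raw header such as 'First Name' comes back as 'first_name' and
# 'Zip-Code' as 'zip_code'; a header with no word characters at all
# (e.g. '***') raises InvalidCsvHeaderException.
print(reader.getColumns())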
Example #23
 def _get_columns_names(self, file_path, **kwargs):
     table = Table(file_path, **kwargs)
     table.read(keyed=True, limit=1)
     return [field['name'] for field in table.infer()['fields']]
Example #24
def test_schema_infer_tabulator():
    table = Table('data/data_infer.csv')
    table.infer()
    assert table.headers == ['id', 'age', 'name']
    assert table.schema.descriptor == SCHEMA_CSV
Example #25
 def data(self):
     source = self.storage.read(self.index_name)
     t = Table(source, ignore_blank_headers=True)
     return (t.infer(), t.headers, t.read(keyed=False, limit=1000))
Example #26
    def __init__(self,
                 source,
                 schema_name,
                 table_name=None,
                 column_map=True,
                 key_columns=None,
                 row_number_as_key=False,
                 schema=None):
        """

        :param source: File containing the table data
        :param schema_name: Name of the Deriva Schema in which this table will be located
        :param table_name: Name of the table.  If not provided, use the source file name
        :param column_map: a column name mapping dictionary, of the form [n1,n2,n3] or {n1:v, n2:v}.  In the list form
                           elements are the exact capitalization of words to be used in a name.  In the dictionary
                           form, the values are what the name should be replaced with.  All matching is done
                           case-insensitively.  Word substitution is only done after column names are split. Other
                           matches are done both before and after the mapping of the name into snake case.

        :param key_columns: name of columns to use as keys (non-null, unique). Can be a single column or a list of
                            lists of columns.  The first element of the list is used as the primary key.
        :param row_number_as_key: if a key column is not provided, use the row number of the CSV in combination with an
                            upload ID generated by the system to identify the row.
        :param schema: existing tableschema file to use instead of inferring types
        """
        if schema is True:
            schema = None
        super(DerivaCSV, self).__init__(source, schema=schema)

        self.source = source
        self.table_name = table_name
        self._column_map = column_map
        self._key_columns = key_columns
        self.schema_name = schema_name
        self.row_count = None
        self.validation_report = None
        self.row_number_as_key = row_number_as_key if self._key_columns is None else False

        # Normalize the column map so we only have a dictionary.
        if self._column_map:
            if isinstance(self._column_map, list):
                # We have a word map.
                self._column_map = {i.upper(): i for i in self._column_map}
            elif isinstance(self._column_map, dict):
                self._column_map = {
                    k.upper(): v
                    for k, v in self._column_map.items()
                }
            else:
                self._column_map = {}

        # If tablename is not specified, use the file name of the data file as the table name.
        if not self.table_name:
            self.table_name = os.path.splitext(os.path.basename(source))[0]

        # Make the table name consistent with the naming strategy
        self.table_name = self.map_name(self.table_name)

        # Do initial infer to set up headers and schema.
        Table.infer(self)

        # Headers have to be unique
        if len(self.headers) != len(set(self.headers)):
            raise DerivaCSVError(msg='Duplicated column name in table')

        # The key columns can be a single column or a list of lists of columns.  Normalize to a list of lists...
        if self._key_columns is None or self._key_columns == []:
            self._key_columns = []
        elif not isinstance(self._key_columns, list):
            self._key_columns = [[self._key_columns]]
        else:
            self._key_columns = [
                i if type(i) is list else [i] for i in self._key_columns
            ]
        self.__set_key_constraints()

        return
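A brief usage sketch for the constructor above; the file name, schema name and column names are placeholders chosen only to illustrate the two column_map forms and the key_columns normalization described in the docstring.

# Illustrative only: 'samples.csv' and 'isa' are placeholder arguments.
# List form of column_map: exact capitalization of words to preserve in names.
csv_a = DerivaCSV('samples.csv', 'isa', column_map=['RNA', 'DNA'])

# Dictionary form of column_map: matched words are replaced with the values
# (matching is case-insensitive). A single key column is normalized to [['sample_id']].
csv_b = DerivaCSV('samples.csv', 'isa',
                  column_map={'ID': 'Identifier'},
                  key_columns='sample_id')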