예제 #1
0
    def test_guessing_uses_first_in_case_of_tie(self):
        """Check that non-strict guessing picks the first listed type.

        The same single-column sample is guessed twice, with the two
        candidate types supplied in opposite orders. Each time the result
        is expected to be one column typed as the first candidate.
        """
        source = StringIO.StringIO('''
            2
            1.1
            1500''')
        table = CSVTableSet(source).tables[0]
        orderings = (
            [DecimalType, IntegerType],
            [IntegerType, DecimalType],
        )
        for candidates in orderings:
            winner = type_guess(table.sample, types=candidates, strict=False)
            # The candidate listed first is the one expected to win.
            assert_equal(winner, [candidates[0]()])
예제 #2
0
    def test_guessing_uses_first_in_case_of_tie(self):
        """Check that non-strict guessing picks the first listed type.

        The same single-column sample is guessed twice, with the two
        candidate types supplied in opposite orders. Each time the result
        is expected to be one column typed as the first candidate.
        """
        source = StringIO.StringIO('''
            2
            1.1
            1500''')
        table = CSVTableSet(source).tables[0]
        orderings = (
            [DecimalType, IntegerType],
            [IntegerType, DecimalType],
        )
        for candidates in orderings:
            winner = type_guess(table.sample, types=candidates, strict=False)
            # The candidate listed first is the one expected to win.
            assert_equal(winner, [candidates[0]()])
예제 #3
0
def analyze_csv(url, sample=1000):
    """Fetch a CSV from ``url`` and propose a column mapping for it.

    The first sampled row is treated as the header row. Column types are
    guessed with ``type_guess`` (restricted to ``LIMITED_TYPES``) and each
    column is then classified as a measure, a date or an attribute.

    Arguments:
        url: location handed to ``urlopen``.
        sample: forwarded to ``CSVRowSet`` as its ``window`` argument.

    Returns:
        ``{'columns': [...], 'mapping': {...}}`` on success, where the
        mapping is keyed by the slugified, lower-cased header. If any step
        raises, ``{'error': <message>}`` is returned instead.
    """
    try:
        fileobj = urlopen(url)
        try:
            row_set = CSVRowSet('data', fileobj, window=sample)
            rows = list(row_set.sample)
        finally:
            # The sample is fully materialised in ``rows``, so the remote
            # handle can be released instead of leaking until collection.
            fileobj.close()
        headers, data_rows = rows[0], rows[1:]
        # NOTE(review): only the rows after the first 500 data rows feed the
        # type guess; a shorter sample presumably yields no types and hence
        # an empty mapping -- confirm this is intended.
        types = type_guess(data_rows[500:], types=LIMITED_TYPES)
        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
                }
            if type_ in ['decimal', 'integer', 'float']:
                # Every numeric type is reported as a float measure.
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[name] = meta
        return {'columns': [h.value for h in headers],
                'mapping': mapping}
    except Exception as e:
        # Deliberate catch-all: callers receive an error payload, not a raise.
        return {'error': unicode(e)}
예제 #4
0
    def test_null_process(self):
        """Check ``null_processor`` alone and combined with type casting.

        With only the null processor registered, the expected ``None``
        positions of the first three rows are asserted. The column types
        are then guessed strictly, applied via ``types_processor``, and the
        ``None`` positions are asserted again.
        """
        def none_flags(rows):
            # True wherever a cell's value is None.
            return [[cell.value is None for cell in row] for row in rows]

        row_set = CSVTableSet(horror_fobj('null.csv')).tables[0]
        row_set.register_processor(null_processor(['null']))

        flags = none_flags(list(row_set))
        assert_equal(flags[0], [False, True, False, False])
        assert_equal(flags[1], [False, False, False, True])
        assert_equal(flags[2], [False, True, False, False])

        guessed = type_guess(row_set.sample, strict=True)
        assert_equal(guessed, [IntegerType() for _ in range(4)])

        row_set.register_processor(types_processor(guessed))

        # after applying the types, '' should become None for int columns
        flags = none_flags(list(row_set))
        assert_equal(flags[0], [False, True, False, False])
        assert_equal(flags[1], [False, False, False, True])
        assert_equal(flags[2], [False, True, True, True])
예제 #5
0
    def test_apply_null_values(self):
        """Check cell emptiness and values after applying guessed types.

        Types are guessed strictly from ``null.csv`` and applied with
        ``types_processor``. The ``empty`` flag of every cell in the first
        six rows is then compared with a fixed table, and the values of the
        third row are checked explicitly.
        """
        row_set = CSVTableSet(horror_fobj('null.csv')).tables[0]
        guessed = type_guess(row_set.sample, strict=True)
        assert_equal(
            guessed,
            [IntegerType(), StringType(), IntegerType(), StringType()])

        row_set.register_processor(types_processor(guessed))
        data = list(row_set)

        # treat null as non empty text and 0 as non empty integer
        expected_empty = [
            [False, False, False, False],
            [False, False, False, False],
            [False, False, True, True],
            [False, False, False, False],
            [False, False, False, True],
            [False, False, False, True],
        ]
        for index, flags in enumerate(expected_empty):
            assert [cell.empty for cell in data[index]] == flags

        # we expect None for Integers and "" for empty strings in CSV
        third_row = [cell.value for cell in data[2]]
        assert third_row == [3, "null", None, ""], data[2]
예제 #6
0
def main(basic_config_file, batch_config_file):
    """Run a single configuration, or a batch of them described by a CSV.

    Arguments:
        basic_config_file: path to a YAML file with the base settings.
        batch_config_file: optional path to a CSV file. When truthy, its
            rows are parsed with messytables into settings dicts and passed
            to ``run_many``. Otherwise a single run is performed from the
            base settings alone.

    In single-run mode the settings, the training log and the final stats
    are written into ``base_settings["out_dir"]``.
    """
    with open(basic_config_file, "r") as f:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects (and newer PyYAML releases require the
        # Loader argument). Prefer yaml.safe_load if this file can come from
        # an untrusted source.
        base_settings = yaml.load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset +
                                                                    1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            # dicts() yields rows lazily; materialise them while the file is
            # still open so run_many never reads from a closed handle.
            settings_list = list(row_set.dicts())
        name = batch_config_file.replace(".csv", "")
        run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # record the settings actually used next to the results
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
예제 #7
0
def analyze_csv(url, sample=1000):
    """Fetch a CSV from ``url`` and propose a column mapping for it.

    The first sampled row is treated as the header row. Column types are
    guessed with ``type_guess`` (restricted to ``LIMITED_TYPES``) and each
    column is then classified as a measure, a date or an attribute.

    Arguments:
        url: location handed to ``urlopen``.
        sample: forwarded to ``CSVRowSet`` as its ``window`` argument.

    Returns:
        ``{'columns': [...], 'mapping': {...}}`` on success, where the
        mapping is keyed by the slugified, lower-cased header. If any step
        raises, the exception is logged and ``{'error': <message>}`` is
        returned instead.
    """
    try:
        fileobj = urlopen(url)
        try:
            row_set = CSVRowSet('data', fileobj, window=sample)
            rows = list(row_set.sample)
        finally:
            # The sample is fully materialised in ``rows``, so the remote
            # handle can be released instead of leaking until collection.
            fileobj.close()
        headers, data_rows = rows[0], rows[1:]
        # NOTE(review): only the rows after the first 500 data rows feed the
        # type guess; a shorter sample presumably yields no types and hence
        # an empty mapping -- confirm this is intended.
        types = type_guess(data_rows[500:], types=LIMITED_TYPES)
        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
            }
            if type_ in ['decimal', 'integer', 'float']:
                # Every numeric type is reported as a float measure.
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[name] = meta
        return {'columns': [h.value for h in headers], 'mapping': mapping}
    except Exception as e:
        # Deliberate catch-all: callers receive an error payload, not a raise.
        log.exception(e)
        return {'error': unicode(e)}
예제 #8
0
def get_column_types(data: io.BytesIO) \
        -> Tuple[List[str], List[types.CellType]]:
    """Guess the header names and column types of a CSV sample.

    The first table that messytables finds in ``data`` is used. Its header
    row is guessed, the rows up to and including the header are skipped,
    and the column types are then guessed strictly from the sample.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data (io.BytesIO): sample of the CSV file

    Returns:
        (List[str], List[types.CellType]): the guessed header names and the
            guessed column types
    """
    rows = messytables.CSVTableSet(data).tables[0]
    header_offset, column_names = messytables.headers_guess(rows.sample)

    # Register the processors before guessing types, so the guess is made
    # on the sample as seen through them.
    rows.register_processor(messytables.headers_processor(column_names))
    rows.register_processor(messytables.offset_processor(header_offset + 1))

    column_types = messytables.type_guess(rows.sample, strict=True)
    return (column_names, column_types)
예제 #9
0
 def test_strict_type_guessing_with_large_file(self):
     """Check strict type guessing against the 96 columns of ``211.csv``.

     The header row is guessed and skipped, then types are guessed strictly
     from a fixed list of candidate types and compared, column by column,
     with the expected list.
     """
     table = CSVTableSet(horror_fobj('211.csv')).tables[0]
     header_offset, _headers = headers_guess(table.sample)
     table.register_processor(offset_processor(header_offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed = type_guess(table.sample, candidates, True)
     assert_equal(len(guessed), 96)
     expected = [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()]
     assert_equal(guessed, expected)
예제 #10
0
 def test_file_with_few_strings_among_integers(self):
     """Check strict type guessing against the 19 columns of ``mixedGLB.csv``.

     The header row is guessed and skipped, then types are guessed strictly
     from a fixed list of candidate types, printed, and compared with the
     expected per-column types.
     """
     table = CSVTableSet(horror_fobj('mixedGLB.csv')).tables[0]
     header_offset, _headers = headers_guess(table.sample)
     table.register_processor(offset_processor(header_offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed = type_guess(table.sample, candidates, True)
     assert_equal(len(guessed), 19)
     print(guessed)
     # Expected layout: 6 integers, 10 strings, 1 integer, 2 strings.
     expected = (
         [IntegerType()] * 6
         + [StringType()] * 10
         + [IntegerType()]
         + [StringType()] * 2
     )
     assert_equal(guessed, expected)
예제 #11
0
 def test_strict_type_guessing_with_large_file(self):
     """Check strict type guessing against the 96 columns of ``211.csv``.

     The header row is guessed and skipped, then types are guessed strictly
     from a fixed list of candidate types and compared, column by column,
     with the expected list.
     """
     table = CSVTableSet(horror_fobj('211.csv')).tables[0]
     header_offset, _headers = headers_guess(table.sample)
     table.register_processor(offset_processor(header_offset + 1))
     candidates = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed = type_guess(table.sample, candidates, True)
     assert_equal(len(guessed), 96)
     expected = [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()]
     assert_equal(guessed, expected)
예제 #12
0
  def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """Guess the header names and column types of a CSV sample.

    The first table that messytables finds in ``data`` is used. Its header
    row is guessed, the rows up to and including the header are skipped, and
    the column types are then guessed strictly from the sample. Both the
    headers and the types are logged at INFO level.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        tuple -- the guessed header names and the guessed column types.
        NOTE(review): annotated as List[str], but the second element is
        whatever ``type_guess`` returns -- confirm the annotation.
    """
    rows = CSVTableSet(data).tables[0]
    header_offset, column_names = headers_guess(rows.sample)
    logging.info(column_names)

    # Register the processors before guessing types, so the guess is made
    # on the sample as seen through them.
    rows.register_processor(headers_processor(column_names))
    rows.register_processor(offset_processor(header_offset + 1))

    column_types = type_guess(rows.sample, strict=True)
    logging.info(column_types)

    return (column_names, column_types)
예제 #13
0
    def test_null_process(self):
        """Check ``null_processor`` alone and combined with type casting.

        With only the null processor registered, the expected ``None``
        positions of the first three rows are asserted. The column types
        are then guessed strictly, applied via ``types_processor``, and the
        ``None`` positions are asserted again.
        """
        def none_flags(rows):
            # True wherever a cell's value is None.
            return [[cell.value is None for cell in row] for row in rows]

        row_set = CSVTableSet(horror_fobj('null.csv')).tables[0]
        row_set.register_processor(null_processor(['null']))

        flags = none_flags(list(row_set))
        assert_equal(flags[0], [False, True, False, False])
        assert_equal(flags[1], [False, False, False, True])
        assert_equal(flags[2], [False, True, False, False])

        guessed = type_guess(row_set.sample, strict=True)
        assert_equal(
            guessed,
            [IntegerType(), BoolType(), BoolType(), BoolType()])

        row_set.register_processor(types_processor(guessed))

        # after applying the types, '' should become None for int columns
        flags = none_flags(list(row_set))
        assert_equal(flags[0], [False, True, False, False])
        assert_equal(flags[1], [False, False, False, True])
        assert_equal(flags[2], [False, True, True, True])
예제 #14
0
def rowset_as_jts(rowset, headers=None, types=None):
    ''' Create a json table schema from a rowset.

    Headers and cell types are both guessed from ``rowset.sample``; the
    guessed types are converted with ``celltype_as_string``.

    NOTE(review): the ``headers`` and ``types`` arguments are overwritten
    by the guessed values, so anything passed in is currently ignored.
    '''
    headers = messytables.headers_guess(rowset.sample)[1]
    guessed_cell_types = messytables.type_guess(rowset.sample)
    types = map(celltype_as_string, guessed_cell_types)

    return headers_and_typed_as_jts(headers, types)
예제 #15
0
    def create_new_model(self, modelname, app_label):
        """Use messytables to guess field types and build a new model.

        Column names are taken from the first row of ``self.csvfile``. If
        any of them is empty, generic ``col_N`` names are generated for all
        columns; otherwise the names are stripped of carriage returns and
        quotes, passed through ``cleancol`` and lower-cased.

        Arguments:
            modelname: name of the model to generate.
            app_label: application label, used as a prefix of the table
                name passed to ``MakeModel.model_from_table``.

        Returns:
            The result of ``MakeModel.model_from_table``, or ``None`` when
            messytables is missing or fails. In that case messages are
            appended to ``self.errors`` and ``self.modelname`` is cleared.
        """
        cols = self.csvfile[0]
        if any(not col for col in cols):
            # One generated name per column: the range must include
            # len(cols), otherwise the last column is silently dropped.
            cols = ["col_%s" % num for num in range(1, len(cols) + 1)]
            print("No column names for %s columns" % len(cols))
        else:
            # strip quotes at ends and replace internal spaces with underscores
            cols = [
                cleancol.sub(
                    "_", col.strip("\r").strip('"').strip("'")).lower()
                for col in cols
            ]
        try:
            from messytables import any_tableset, type_guess
        except ImportError:
            # Only a missing dependency is expected here; anything else
            # should surface instead of being reported as "not installed".
            self.errors.append(
                "If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org"
            )
            self.modelname = ""
            return
        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = type_guess(row_set.sample)
            types = [str(typeobj) for typeobj in types]
            # If the header has more cols than the data has cols - ignore the end ones
            if len(cols) > len(types):
                cols = cols[:len(types)]
        except Exception as err:
            self.errors.append("messytables could not run due to error")
            self.errors.append(str(err))
            self.modelname = ""
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == "String" and length > 255:
                # Long strings are described as "Text" instead.
                types[i] = "Text"
            integer = length
            # Decimal places: half the length, capped at 10.
            decimal = min(int(length / 2), 10)
            blank = True
            default = True
            column = (col, types[i], length, length, integer, decimal, blank,
                      default)
            fieldset.append(column)
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel

        maker = MakeModel()
        return maker.model_from_table("%s_%s" % (app_label, modelname),
                                      fieldset)
예제 #16
0
def generate_mapping(fileobj, sample=2000):
    """Propose a column mapping for the CSV readable from ``fileobj``.

    The first sampled row is treated as the header row. For every column
    the guessed type and the frequent values (from ``frequent_values``)
    are recorded, and the column is classified as a measure, a date or a
    plain value.

    Arguments:
        fileobj: file-like object handed to ``CSVRowSet``.
        sample: forwarded to ``CSVRowSet`` as its ``window`` argument.

    Returns:
        dict keyed by the slugified, lower-cased header; each entry holds
        ``label``, ``column``, ``common_values``, ``datatype`` and ``type``.
    """
    row_set = CSVRowSet('data', fileobj, window=sample)
    rows = list(row_set.sample)
    headers, data_rows = rows[0], rows[1:]
    values = frequent_values(data_rows)
    types = type_guess(data_rows)
    mapping = {}
    for header, type_, value in zip(headers, types, values):
        type_ = repr(type_).lower()
        name = slugify(header.value).lower()
        meta = {
            'label': header.value,
            'column': header.value,
            'common_values': value,
            'datatype': type_
            }
        if type_ in ['decimal', 'integer', 'float']:
            # Every numeric type is reported as a float measure.
            meta['type'] = 'measure'
            meta['datatype'] = 'float'
        elif type_.startswith('date'):
            # messytables date types render with their format appended
            # (e.g. "date(...)"), so an exact match on 'date' would miss
            # them; match on the prefix instead.
            meta['type'] = 'date'
            meta['datatype'] = 'date'
        else:
            meta['type'] = 'value'
        mapping[name] = meta
    return mapping
예제 #17
0
    def get_schema(self, filename):
        """
        Guess a schema for ``filename`` using messytables.

        Returns a list of ``[header, type name, sample value]`` entries,
        one per guessed header, or an empty list when ``read_file``
        returns ``None``.
        """
        table_set = self.read_file(filename)
        if table_set is None:
            # Nothing could be read for this filename.
            return []

        # Only the first table is inspected.
        first_table = table_set.tables[0]

        header_offset, column_names = headers_guess(first_table.sample)
        first_table.register_processor(headers_processor(column_names))
        first_table.register_processor(offset_processor(header_offset + 1))
        column_types = type_guess(first_table.sample, strict=True)

        # One sampled row supplies an example value for each column.
        example_row = next(first_table.sample)

        def as_text(value):
            return value if isinstance(value, str) else str(value)

        return [
            [name,
             str(column_types[position]),
             as_text(example_row[position].value)]
            for position, name in enumerate(column_names)
        ]
예제 #18
0
def rowset_as_jts(rowset, headers=None, types=None):
    ''' Create a json table schema from a rowset.

    Headers and cell types are both guessed from ``rowset.sample``; the
    guessed types are converted with ``celltype_as_string``.

    NOTE(review): the ``headers`` and ``types`` arguments are overwritten
    by the guessed values, so anything passed in is currently ignored.
    '''
    headers = messytables.headers_guess(rowset.sample)[1]
    guessed_cell_types = messytables.type_guess(rowset.sample)
    types = map(celltype_as_string, guessed_cell_types)

    return headers_and_typed_as_jts(headers, types)
예제 #19
0
    def get_schema(self, filename):
        """
        Guess a schema for ``filename`` using messytables.

        Returns a list of ``[header, type name, sample value]`` entries,
        one per guessed header, or an empty list when ``read_file``
        returns ``None``.
        """
        table_set = self.read_file(filename)
        if table_set is None:
            # Nothing could be read for this filename.
            return []

        # Only the first table is inspected.
        first_table = table_set.tables[0]

        header_offset, column_names = headers_guess(first_table.sample)
        first_table.register_processor(headers_processor(column_names))
        first_table.register_processor(offset_processor(header_offset + 1))
        column_types = type_guess(first_table.sample, strict=True)

        # One sampled row supplies an example value for each column.
        example_row = next(first_table.sample)

        def as_text(value):
            return value if isinstance(value, str) else str(value)

        return [
            [name,
             str(column_types[position]),
             as_text(example_row[position].value)]
            for position, name in enumerate(column_names)
        ]
예제 #20
0
def main(argv=None):
    """Guess the schema of a single-table file and upload it to Myria.

    The input comes from ``args.file`` or, when that is ``None``, from
    standard input. Headers and column types are guessed with messytables,
    a Myria schema is built from them, and the data is either uploaded or,
    with ``args.dry`` set, written to standard output.

    Arguments:
        argv: argument list handed to ``parse_args``.

    Raises:
        ValueError: if the input does not contain exactly one table.
    """
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s" % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))
    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True, types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            for cell_type, header in zip(types, headers):
                cell_type.cast(header)
        except Exception:
            # A header value does not fit its column's type, so the first
            # row really is a header row. Only ordinary errors are ignored;
            # KeyboardInterrupt and SystemExit still propagate.
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname, port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data, args.overwrite, **kwargs)

        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
0
 def test_non_strict_guessing_handles_padding(self):
     """Check non-strict guessing on a CSV whose cells are space-padded.

     Three columns are expected, guessed as integer, string and decimal.
     """
     source = StringIO.StringIO('''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=False)
     assert_equal(len(guessed), 3)
     expected = [IntegerType(), StringType(), DecimalType()]
     assert_equal(guessed, expected)
예제 #22
0
 def test_non_strict_guessing_handles_padding(self):
     """Check non-strict guessing on a CSV whose cells are space-padded.

     Three columns are expected, guessed as integer, string and decimal.
     """
     source = StringIO.StringIO('''
         1,   , 2.1
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=False)
     assert_equal(len(guessed), 3)
     expected = [IntegerType(), StringType(), DecimalType()]
     assert_equal(guessed, expected)
예제 #23
0
 def test_strict_guessing_handles_padding(self):
     """Check strict guessing on a CSV whose cells are space-padded.

     Three columns are expected, guessed as string, string and decimal.
     """
     source = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=True)
     assert_equal(len(guessed), 3)
     expected = [StringType(), StringType(), DecimalType()]
     assert_equal(guessed, expected)
예제 #24
0
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param stream: file object handed to the table set's ``from_fileobj``
    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: when truthy, a guessed ``type`` is added to each field
    :returns: a tuple of (iterator of row dicts keyed by column id,
        ``{'fields': [...]}`` metadata)
    '''
    sheet_index = int(sheet) - 1

    tableset_class = XLSXTableSet if excel_type == 'xlsx' else XLSTableSet
    table_set = tableset_class.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_index]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_index + 1))
    offset, headers = headers_guess(row_set.sample)

    if guess_types:
        candidate_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, candidate_types)

    fields = []
    duplicate_counts = {}
    unnamed_counter = 1
    for index, field in enumerate(headers):
        if "" == field:
            # Blank headers get a generated name, which is also written
            # back into ``headers``.
            field = '_'.join(['column', str(unnamed_counter)])
            headers[index] = field
            unnamed_counter += 1
        if headers.count(field) == 1:
            field_id = field
        else:
            # Repeated names are numbered in order of appearance.
            duplicate_counts[field] = duplicate_counts.get(field, 0) + 1
            field_id = u'_'.join([field, str(duplicate_counts[field])])
        field_dict = {'id': field_id}
        if guess_types:
            guessed = row_types[index]
            if isinstance(guessed, DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(guessed)
        fields.append(field_dict)

    field_ids = [entry['id'] for entry in fields]
    row_set.register_processor(headers_processor(field_ids))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        # Yield one dict per row, mapping column id to cell value.
        for row in row_set:
            data_row = {}
            for cell in row:
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
예제 #25
0
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param stream: file object handed to the table set's ``from_fileobj``
    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from index = 1)
    :param guess_types: when truthy, a guessed ``type`` is added to each field
    :returns: a tuple of (iterator of row dicts keyed by column id,
        ``{'fields': [...]}`` metadata)
    '''
    sheet_index = int(sheet) - 1

    tableset_class = XLSXTableSet if excel_type == 'xlsx' else XLSTableSet
    table_set = tableset_class.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_index]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_index + 1))
    offset, headers = headers_guess(row_set.sample)

    if guess_types:
        candidate_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, candidate_types)

    fields = []
    duplicate_counts = {}
    unnamed_counter = 1
    for index, field in enumerate(headers):
        if "" == field:
            # Blank headers get a generated name, which is also written
            # back into ``headers``.
            field = '_'.join(['column', str(unnamed_counter)])
            headers[index] = field
            unnamed_counter += 1
        if headers.count(field) == 1:
            field_id = field
        else:
            # Repeated names are numbered in order of appearance.
            duplicate_counts[field] = duplicate_counts.get(field, 0) + 1
            field_id = u'_'.join([field, str(duplicate_counts[field])])
        field_dict = {'id': field_id}
        if guess_types:
            guessed = row_types[index]
            if isinstance(guessed, DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(guessed)
        fields.append(field_dict)

    field_ids = [entry['id'] for entry in fields]
    row_set.register_processor(headers_processor(field_ids))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        # Yield one dict per row, mapping column id to cell value.
        for row in row_set:
            data_row = {}
            for cell in row:
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
예제 #26
0
 def test_strict_guessing_handles_padding(self):
     """Check strict guessing on a CSV whose cells are space-padded.

     Three columns are expected, guessed as string, string and decimal.
     """
     source = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=True)
     assert_equal(len(guessed), 3)
     expected = [StringType(), StringType(), DecimalType()]
     assert_equal(guessed, expected)
예제 #27
0
    def test_json_type(self):
        """Check that columns holding JSON objects/arrays are guessed as JSON.

        The first two columns contain quoted JSON documents and the third
        plain text; the expected guess is JSON, JSON, string.
        """
        source = StringIO.StringIO('''
        "{""a"":""b"", ""c"":""d""}",       "[1, 2, 3]",                12a
        "[""a"", [1, 2, {""a"":""b""}]]",   "{""a"": 1, ""b"":[1, 2]}", abc
        ,,                                                              "abc"
        ''')

        table = CSVTableSet(source).tables[0]
        expected = [JsonType(), JsonType(), StringType()]
        assert_equal(type_guess(table.sample), expected)
예제 #28
0
    def test_wkt_type(self):
        """Check strict guessing of geometry columns.

        The first column holds hex-encoded values and the second
        ``SRID=...;`` text values; the expected guess is ``EWKB`` for the
        first and ``EWKT`` for the second.
        """
        source = StringIO.StringIO('''
        "0102000020e6100000020000000000000000002640000000000000474000000000000024400000000000804640",
        "0102000020787f0000020000000000000000002640000000000000474000000000000024400000000000804640", "SRID=4326;LINESTRING(11 46,10 45)"
        "0101000020e610000000000000000026400000000000004740", "SRID=4326;LINESTRING(11 46,10 45)"
        , "SRID=4326;POINT(11 46)"
        ''')

        table = CSVTableSet(source).tables[0]
        expected = [EWKB(), EWKT()]
        assert_equal(type_guess(table.sample, strict=True), expected)
예제 #29
0
def rowset_as_schema(rowset):
    """Describe the columns of ``rowset`` as a ``JSONTableSchema``.

    Header names and cell types are both guessed from ``rowset.sample``;
    each header is used as the field id and as its label.
    """
    # headers_guess returns a pair; its first element is not needed here.
    _offset, column_names = messytables.headers_guess(rowset.sample)
    type_names = map(celltype_as_string,
                     messytables.type_guess(rowset.sample))

    schema = jsontableschema.JSONTableSchema()
    for name, type_name in zip(column_names, type_names):
        schema.add_field(field_id=name, label=name, field_type=type_name)
    return schema
예제 #30
0
    def connect(self,
                host=None,
                port=None,
                database=None,
                username=None,
                password=None,
                file=None):
        """Open a connection for ``self.engine`` and return ``self.conn``.

        Handled engines are ``'psycopg2'``, ``'pymssql'`` and ``'csv'``; for
        ``'csv'`` the "connection" is a messytables row set built from
        ``self.data``. For any other engine nothing is opened and whatever
        ``self.conn`` already holds is returned.

        :param host: server host (psycopg2 / pymssql)
        :param port: server port (psycopg2 / pymssql)
        :param database: database name; always stored on ``self.database``
        :param username: login user (psycopg2 / pymssql)
        :param password: login password (psycopg2 / pymssql)
        :param file: not used by this method
        :return: ``self.conn``
        """
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        if self.engine == 'psycopg2':
            def quoted(value):
                # Inside a quoted libpq value, backslashes and single
                # quotes have to be escaped with a backslash.
                text = '%s' % (value,)
                return text.replace('\\', '\\\\').replace("'", "\\'")

            settings = (
                ('dbname', database),
                ('user', username),
                ('host', host),
                ('port', port),
                ('password', password),
            )
            # Only settings that were actually supplied go into the string.
            conn_string = ''.join(
                "%s='%s' " % (key, quoted(value))
                for key, value in settings if value)
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host,
                                        username,
                                        password,
                                        database,
                                        port=port,
                                        as_dict=True,
                                        charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            # Skip past the header row so iteration starts at the data.
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn
예제 #31
0
    def create_new_model(self, modelname, app_label):
        """Use messytables to guess field types and build a new model.

        Column names are taken from the first row of ``self.csvfile``. If any
        of them is blank, every column gets a generated ``col_N`` name
        instead. When messytables is missing or fails, a message is appended
        to ``self.errors``, ``self.modelname`` is reset to ``''`` and ``None``
        is returned.

        :param modelname: model name, second half of the table name
        :param app_label: app label, first half of the table name
        :return: result of ``MakeModel.model_from_table``, or ``None``
        """
        cols = self.csvfile[0]
        if any(not col for col in cols):
            # range() excludes its end, so +1 is needed to name every column.
            cols = ['col_%s' % num for num in range(1, len(cols) + 1)]
            print('No column names for %s columns' % len(cols))
        else:
            cols = [cleancol.sub('_', col).lower() for col in cols]
        try:
            from messytables import any_tableset, type_guess
        except ImportError:
            self.errors.append(
                'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org'
            )
            self.modelname = ''
            return
        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = [str(typeobj) for typeobj in type_guess(row_set.sample)]
        except Exception:
            # Best effort: any failure inside messytables is reported to the
            # caller through self.errors rather than raised.
            self.errors.append('messytables could not guess your column types')
            self.modelname = ''
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == 'String' and length > 255:
                types[i] = 'Text'
            # Decimal places: half the maximum length, capped at 10.
            decimal = min(int(length / 2), 10)
            # Tuple layout: name, type, length, length, integer digits,
            # decimal places, blank flag, default flag.
            fieldset.append(
                (col, types[i], length, length, length, decimal, True, True))
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel
        maker = MakeModel()
        return maker.model_from_table('%s_%s' % (app_label, modelname),
                                      fieldset)
예제 #32
0
def proc(f, database_name, table_name):
    """Load the first table found in ``f`` into a Hive table.

    Headers and column types (restricted to string and date) are guessed
    with messytables. The rows are written to a temporary data file, a
    DROP/CREATE/LOAD script is written to a second temporary file, and the
    ``hive`` command line tool is invoked on that script.

    :param f: file object handed to ``messytables.any_tableset``
    :param database_name: value passed to ``hive --database``
    :param table_name: name of the table that is dropped and recreated
    """
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    fields_ddl = ','.join(
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    )

    # Both temporary files must still exist while hive runs, so the call
    # happens inside the with-block; leaving it closes and removes them
    # even when an error occurs.
    with tempfile.NamedTemporaryFile(mode='w') as hive_data_file, \
            tempfile.NamedTemporaryFile(mode='w') as hive_cmd_file:
        hive_sql = '''
DROP TABLE IF EXISTS {0};

CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

        print(hive_sql, file=hive_cmd_file)
        hive_cmd_file.flush()

        row_set.register_processor(messytables.types_processor(types))

        # One line per row, cell values joined by the \001 control character.
        for row in row_set:
            print('\001'.join(str(c.value) for c in row),
                  file=hive_data_file)
        hive_data_file.flush()

        subprocess.call([
            'hive',
            '--database', database_name,
            '-f', hive_cmd_file.name,
        ])
예제 #33
0
    def test_type_guess(self):
        """Default guessing over six mixed columns, including bool columns."""
        source = io.BytesIO(b'''
            1,   2012/2/12, 2,   02 October 2011,  yes,   1
            2,   2012/2/12, 2,   02 October 2011,  true,  1
            2.4, 2012/2/12, 1,   1 May 2011,       no,    0
            foo, bar,       1000, ,                false, 0
            4.3, ,          42,  24 October 2012,,
             ,   2012/2/12, 21,  24 December 2013, true,  1''')
        table = CSVTableSet(source).tables[0]
        guessed = type_guess(table.sample)

        expected = [
            DecimalType(),
            DateType('%Y/%m/%d'),
            IntegerType(),
            DateType('%d %B %Y'),
            BoolType(),
            BoolType(),
        ]
        assert_equal(guessed, expected)
예제 #34
0
    def create_new_model(self, modelname, app_label):
        """Use messytables to guess field types and build a new model.

        Column names are taken from the first row of ``self.csvfile``. If any
        of them is blank, every column gets a generated ``col_N`` name
        instead. When messytables is missing or fails, the reason is appended
        to ``self.errors``, ``self.modelname`` is reset to ``''`` and ``None``
        is returned.

        :param modelname: model name, second half of the table name
        :param app_label: app label, first half of the table name
        :return: result of ``MakeModel.model_from_table``, or ``None``
        """
        cols = self.csvfile[0]
        if any(not col for col in cols):
            # range() excludes its end, so +1 is needed to name every column.
            cols = ['col_%s' % num for num in range(1, len(cols) + 1)]
            print('No column names for %s columns' % len(cols))
        else:
            cols = [cleancol.sub('_', col).lower() for col in cols]
        try:
            from messytables import any_tableset, type_guess
        except ImportError:
            self.errors.append(
                'If you want to inspect CSV files to generate model code, you must install https://messytables.readthedocs.org')
            self.modelname = ''
            return

        try:
            table_set = any_tableset(self.filehandle)
            row_set = table_set.tables[0]
            types = [str(typeobj) for typeobj in type_guess(row_set.sample)]
        except Exception as err:
            # Best effort: failures inside messytables are reported through
            # self.errors rather than raised.
            self.errors.append('messytables could not run due to error')
            self.errors.append(str(err))
            self.modelname = ''
            return

        fieldset = []
        maximums = self.get_maxlengths(cols)
        for i, col in enumerate(cols):
            length = maximums[i]
            if types[i] == 'String' and length > 255:
                types[i] = 'Text'
            # Decimal places: half the maximum length, capped at 10.
            decimal = min(int(length / 2), 10)
            # Tuple layout: name, type, length, length, integer digits,
            # decimal places, blank flag, default flag.
            fieldset.append(
                (col, types[i], length, length, length, decimal, True, True))
        # Import here so that messytables is not a dependency for just using csvimport cmd
        from csvimport.make_model import MakeModel
        maker = MakeModel()
        return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
예제 #35
0
    def test_type_guess(self):
        """Default guessing over six mixed columns, including bool columns."""
        source = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011,  yes,   1
            2,   2012/2/12, 2,   02 October 2011,  true,  1
            2.4, 2012/2/12, 1,   1 May 2011,       no,    0
            foo, bar,       1000, ,                false, 0
            4.3, ,          42,  24 October 2012,,
             ,   2012/2/12, 21,  24 December 2013, true,  1''')
        table = CSVTableSet(source).tables[0]
        guessed = type_guess(table.sample)

        expected = [
            DecimalType(),
            DateType('%Y/%m/%d'),
            IntegerType(),
            DateType('%d %B %Y'),
            BoolType(),
            BoolType(),
        ]
        assert_equal(guessed, expected)
예제 #36
0
 def test_type_guess_forced(self):
     """A forced type for the third column overrides guessing for it."""
     source = StringIO.StringIO('''
         1,  aaa,    true
         2,  bbb,    false
         3,  ccc,
         4,  ,       yes
         5,  ddd,    no
     ''')
     table = CSVTableSet(source).tables[0]
     # None leaves a column to the guesser; only column three is forced.
     forced = [None, None, StringType()]
     guessed = type_guess(table.sample, forced_types=forced)
     expected = [IntegerType(), StringType(), StringType()]
     assert_equal(guessed, expected)
예제 #37
0
    def test_type_guess(self):
        """Default guessing over six mixed columns, the last one holding times."""
        source = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011,  yes,     11
            2,   2012/2/12, 2,   02 October 2011,  true,    9 am
            2.4, 2012/2/12, 1,   1 May 2011,       no,      23:00.123
            foo, bar,       1000, ,                false,   12:00
            4.3, ,          42,  24 October 2012,  ,        7.12
             ,   2012/2/12, 21,  24 December 2013, true,    11PM ''')
        table = CSVTableSet(source).tables[0]
        guessed = type_guess(table.sample)

        expected = [
            DecimalType(),
            DateType('%Y/%m/%d'),
            IntegerType(),
            DateType('%d %B %Y'),
            BoolType(),
            TimeType(),
        ]
        assert_equal(guessed, expected)
예제 #38
0
    def test_read_type_guess_simple(self):
        """Guess types for simple.csv, then check they are applied on read.

        After registering the types processor, the first row's cells are all
        StringType while the third row carries the guessed types.
        """
        table = CSVTableSet(horror_fobj('simple.csv')).tables[0]
        guessed = type_guess(table.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(guessed, expected_types)

        table.register_processor(types_processor(guessed))
        parsed = list(table)
        first_row_types = [cell.type for cell in parsed[0]]
        assert_equal(first_row_types, [StringType()] * 3)
        third_row_types = [cell.type for cell in parsed[2]]
        assert_equal(expected_types, third_row_types)
예제 #39
0
    def test_type_guess(self):
        """Default guessing over four columns: decimal, date, integer, date."""
        source = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011
            2,   2012/2/12, 2,   02 October 2011
            2.4, 2012/2/12, 1,   1 May 2011
            foo, bar,       1000,
            4.3, ,          42,  24 October 2012
             ,   2012/2/12, 21,  24 December 2013''')
        table = CSVTableSet(source).tables[0]
        guessed = type_guess(table.sample)

        expected = [
            DecimalType(),
            DateType('%Y/%m/%d'),
            IntegerType(),
            DateType('%d %B %Y'),
        ]
        assert_equal(guessed, expected)
예제 #40
0
def csvParse(csv_file_path):
    """Open a CSV file and return its first table with guessed metadata.

    Returns ``(row_set, headers, offset, types)``. The file is opened in
    binary mode and is not closed here; the returned row set was built on
    top of it.
    """
    handle = open(csv_file_path, 'rb')
    row_set = CSVTableSet(handle).tables[0]

    # Header names plus the position of the header row.
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # offset + 1 starts iteration at the content, not the header.
    row_set.register_processor(offset_processor(offset + 1))

    # Column types, guessed strictly, are applied while iterating.
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    return row_set, headers, offset, types
예제 #41
0
    def test_read_type_guess_simple(self):
        """Guess types for simple.csv, then check they are applied on read.

        After registering the types processor, the first row's cells are all
        StringType while the third row carries the guessed types.
        """
        fh = horror_fobj("simple.csv")
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # Real lists are required: on Python 3 map() returns an iterator,
        # which never compares equal to a list.
        header_types = [c.type for c in data[0]]
        assert_equal(header_types, [StringType()] * 3)
        row_types = [c.type for c in data[2]]
        assert_equal(expected_types, row_types)
예제 #42
0
 def test_type_guess_strict(self):
     """Strict guessing over six columns under the en_GB.UTF-8 locale."""
     import locale
     # Note: the locale is changed process-wide and not restored.
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     source = StringIO.StringIO('''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=True)
     expected = [
         StringType(),
         StringType(),
         DecimalType(),
         IntegerType(),
         DateType('%d %B %Y'),
         DecimalType(),
     ]
     assert_equal(guessed, expected)
예제 #43
0
 def test_type_guess_strict(self):
     """Strict guessing over six columns under the en_GB.UTF-8 locale."""
     import locale
     # Note: the locale is changed process-wide and not restored.
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     source = StringIO.StringIO('''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     table = CSVTableSet(source).tables[0]
     guessed = type_guess(table.sample, strict=True)
     expected = [
         StringType(),
         StringType(),
         DecimalType(),
         IntegerType(),
         DateType('%d %B %Y'),
         DecimalType(),
     ]
     assert_equal(guessed, expected)
예제 #44
0
 def test_file_with_few_strings_among_integers(self):
     """Check the guessed types for the 19 columns of mixedGLB.csv."""
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     # Only the offset is needed; guessing starts after the header row.
     offset, _headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     # Call form: same output on Python 2, valid syntax on Python 3.
     print(guessed_types)
     assert_equal(guessed_types, [
         IntegerType(), IntegerType(),
         IntegerType(), IntegerType(), IntegerType(), IntegerType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), IntegerType(), StringType(),
         StringType()])
예제 #45
0
	def create_sql_table(self, rowset, sql_table_name=None, headers=None, types=None):
		"""
		Create a SQL table schema from a MessyTables RowSet.

		The headers and type names that are used are also stored on
		``self.headers`` and ``self.header_types``.

		:param rowset: MessyTables row set whose sample is inspected
		:param sql_table_name: table name; defaults to ``self.table_name``
		:param headers: column names; guessed from the sample when omitted
		:param types: column type names; guessed from the sample when omitted
		:return: result of ``self.headers_and_typed_as_sql``
		"""

		# if a different name isn't specified, use the primary root name
		if not sql_table_name:
			sql_table_name = self.table_name

		# Caller-supplied values are honoured instead of being overwritten.
		if headers is None:
			# we don't care about the offset returned, so just throw it away
			_, headers = messytables.headers_guess(rowset.sample)
		if types is None:
			guessed = messytables.type_guess(rowset.sample, strict=False)
			types = [self.celltype_as_string(cell_type) for cell_type in guessed]
		else:
			# A real list, so the stored copy survives the pass made below.
			types = list(types)

		self.headers = headers
		self.header_types = types

		return self.headers_and_typed_as_sql(sql_table_name, headers, types)
예제 #46
0
파일: helpers.py 프로젝트: trickvi/spendb
def csvimport_table(name):
    """Parse the CSV fixture ``name`` and return ``(fields, rows)``.

    ``fields`` is the value from the last tuple produced by ``parse_table``;
    ``rows`` collects the row from every tuple.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    table = CSVTableSet(data_fixture(name)).tables[0]
    header_offset, header_names = headers_guess(table.sample)
    table.register_processor(headers_processor(header_names))
    table.register_processor(offset_processor(header_offset + 1))
    column_types = type_guess(table.sample, strict=True)
    table.register_processor(types_processor(column_types))

    rows = []
    # The third tuple element is not needed here.
    for fields, row, _samples in parse_table(table):
        rows.append(row)

    return fields, rows
예제 #47
0
파일: helpers.py 프로젝트: trickvi/spendb
def csvimport_table(name):
    """Parse the CSV fixture ``name`` and return ``(fields, rows)``.

    ``fields`` is the value from the last tuple produced by ``parse_table``;
    ``rows`` collects the row from every tuple.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    table = CSVTableSet(data_fixture(name)).tables[0]
    header_offset, header_names = headers_guess(table.sample)
    table.register_processor(headers_processor(header_names))
    table.register_processor(offset_processor(header_offset + 1))
    column_types = type_guess(table.sample, strict=True)
    table.register_processor(types_processor(column_types))

    rows = []
    # The third tuple element is not needed here.
    for fields, row, _samples in parse_table(table):
        rows.append(row)

    return fields, rows
예제 #48
0
파일: csv_utils.py 프로젝트: consbio/tablo
def prepare_csv_rows(csv_file):
    """Return the first table of ``csv_file`` with header, offset and type
    processors registered.

    Blank headers are dropped and the rest converted to column names.
    Note that ``DateType.formats`` is reassigned on the class itself.
    """
    row_set = CSVTableSet(csv_file).tables[0]

    offset, raw_headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header)
               for header in raw_headers if header]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)

    # Boolean types are never wanted, so the candidate list omits them.
    types = type_guess(
        row_set.sample,
        types=[StringType, DecimalType, IntegerType, DateType],
        strict=True)
    row_set.register_processor(types_processor(types))

    return row_set
예제 #49
0
def parse_table(source):
    """Yield ``(error, fields, data)`` for the rows of the first table.

    For a converted row ``error`` is ``None``. When reading or converting a
    row raises, the exception is yielded as ``error`` with ``data`` set to
    ``None``. Rows whose converted values are all ``None`` are skipped. If
    the source contains no tables, an error is logged and nothing is
    yielded.
    """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"',
                                window=20000)
    tables = list(table_set.tables)
    if not tables:
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    # The first sample row is taken as the header row.
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            # next() works on Python 2.6+ and Python 3 alike.
            row = next(row_iter)
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception as e:
            # log.exception(e)
            yield e, fields, None
예제 #50
0
def parse_data(input):
    """Read the file at ``input`` and return ``(headers_dicts, rows)``.

    ``headers_dicts`` is a list of ``{'id': name, 'type': mapped_type}``
    entries, one per column with a non-blank header. ``rows`` is a generator
    of dicts keyed by stripped column name; it reads lazily, so the file is
    left open on success.

    :param input: path of the file to parse
    :raises messytables.ReadError: if the file cannot be read as a table set
    """
    fh = open(input, 'rb')

    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)
        # Nothing can be parsed without a table set: release the file and
        # let the caller see the real error.
        fh.close()
        raise

    row_set = table_set.tables.pop()
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    # Pair each header with its type BEFORE dropping blank headers, so a
    # blank column cannot shift the types of the columns after it.
    headers_dicts = [
        dict(id=header.strip(), type=TYPE_MAPPING[str(type_)])
        for header, type_ in zip(headers, types)
        if header.strip()
    ]

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for cell in row:
                column_name = cell.column.strip()
                # Cells under blank headers are left out of the row.
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row

    result = row_iterator()

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
예제 #51
0
def parse_table(source):
    """Yield ``(error, fields, data)`` for the rows of the first table.

    For a converted row ``error`` is ``None``. When reading or converting a
    row raises, the exception is yielded as ``error`` with ``data`` set to
    ``None``. Rows whose converted values are all ``None`` are skipped. If
    the source contains no tables, an error is logged and nothing is
    yielded.
    """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not tables:
        log.error("No tables were found in the source file.")
        return
    row_set = tables[0]
    # The first sample row is taken as the header row.
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            # next() works on Python 2.6+ and Python 3 alike.
            row = next(row_iter)
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception as e:
            # log.exception(e)
            yield e, fields, None
예제 #52
0
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the gzipped csv file and try to guess the type of each column using the messytables library.
        The type can be 'Integer', 'Decimal', 'String' or 'Bool'
        :param csv_file_path: path to the gzipped csv file with content in it
        :return: a Zip object where each tuple has two elements: the first is the column name and the second is the type
        """
        with gzip.open(csv_file_path, 'rb') as gz_stream:
            row_set = CSVTableSet(gz_stream).tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))

            guessed = type_guess(row_set.sample, strict=True)
            # Built as a list while the file is still open.
            type_names = [jts.celltype_as_string(cell_type)
                          for cell_type in guessed]
            return zip(headers, type_names)
예제 #53
0
    def test_apply_null_values(self):
        """Check the ``empty`` flags and values of cells read from null.csv."""
        row_set = CSVTableSet(horror_fobj('null.csv')).tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [IntegerType(), StringType(), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat null as non empty text and 0 as non empty integer
        expected_empty = [
            [False, False, False, False],
            [False, False, False, False],
            [False, False, True, True],
            [False, False, False, False],
            [False, False, False, True],
            [False, False, False, True],
        ]
        for position, flags in enumerate(expected_empty):
            assert [cell.empty for cell in data[position]] == flags

        # we expect None for Integers and "" for empty strings in CSV
        assert [cell.value for cell in data[2]] == [3, "null", None, ""], data[2]
예제 #54
0
def resource_row_set(package, resource):
    """ Return the first table of this resource's source data as a row
    set with header, offset and type processors registered, or ``None``
    (after logging an error) when the source holds no tables. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    meta = resource.meta
    table_set = any_tableset(resource.fh(),
                             extension=meta.get('extension'),
                             mimetype=meta.get('mime_type'))
    tables = list(table_set.tables)
    if not tables:
        log.error("No tables were found in the source file.")
        return

    first_table = tables[0]
    offset, headers = headers_guess(first_table.sample)
    first_table.register_processor(headers_processor(headers))
    first_table.register_processor(offset_processor(offset + 1))
    column_types = type_guess(first_table.sample, strict=True)
    first_table.register_processor(types_processor(column_types))
    return first_table
예제 #55
0
def _guess_csv_datatype(fh):
    """Return a ``{header: guessed type}`` dict for the CSV in ``fh``.

    The guessed offset/headers, up to the first 32 rows and the resulting
    dict are written to the log at info level.
    """
    row_set = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    # Log at most 32 rows.
    for seen, row in enumerate(row_set, 1):
        logger.info(row)
        if seen >= 32:
            break

    guessed = dict(zip(headers, types))
    logger.info(guessed)
    return guessed
예제 #56
0
    def test_regex_type(self):
        """Custom RegExType subclasses can be passed to type_guess."""
        source = StringIO.StringIO('''
        aaa,    bbb,    ccc
        ,   ,
        aa,     bb,     cc
        a,      b,      c
        ''')

        class Type1(RegExType):
            regex = '^a+$'

        class Type2(RegExType):
            regex = '^b+$'

        table = CSVTableSet(source).tables[0]
        candidates = [Type1(), Type2(), StringType()]
        guessed = type_guess(table.sample, types=candidates)

        expected = [Type1(), Type2(), StringType()]
        assert_equal(guessed, expected)
예제 #57
0
    def update(self, config):
        """Refresh the metadata of every existing file listed in ``config``.

        Each entry of ``config['files']`` whose ``filename`` exists on disk
        gets its ``mimetype`` and ``sha1`` updated; files whose lower-cased
        name ends in ``sv`` also get a ``schema`` of guessed column types.
        Entries for missing files are left untouched.

        :param config: dict with a ``'files'`` list of dicts, each having a
            ``'filename'`` key; modified in place
        :return: the same ``config`` object
        """
        # Update the mime, sha1 of the files
        for entry in config['files']:
            filename = entry['filename']
            if not os.path.exists(filename):
                continue

            u = {
                'mimetype': mimetypes.guess_type(filename)[0],
                'sha1': compute_sha1(filename)
            }

            if filename.lower().endswith('sv'):  # csv/tsv
                # The table set needs an open file object; types are
                # guessed while it is still open.
                with open(filename, 'rb') as csv_file:
                    rows = CSVTableSet(csv_file).tables[0]
                    guessed_types = type_guess(rows.sample)
                u['schema'] = guessed_types

            entry.update(u)

        return config
예제 #58
0
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess columns types from the given samples and build json schema
    :param samples: List of dictionaries containing samples data from csv file(s)
    :param table_spec: table/stream specs given in the tap definition
    :return: dictionary where the keys are the headers and values are the guessed types - compatible with json schema
    """
    table_set = CSVTableSet(_csv2bytesio(samples))

    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    types = type_guess(row_set.sample, strict=True)

    # Same for every column, so it is built once instead of per header.
    date_overrides = set(table_spec.get('date_overrides', []))

    schema = {}
    for header, header_type in zip(headers, types):
        if header in date_overrides:
            # An override wins over whatever type was guessed.
            schema[header] = {'type': ['null', 'string'], 'format': 'date-time'}
        elif isinstance(header_type, IntegerType):
            schema[header] = {
                'type': ['null', 'integer']
            }
        elif isinstance(header_type, DecimalType):
            schema[header] = {
                'type': ['null', 'number']
            }
        else:
            # Every other guessed type falls back to a nullable string.
            schema[header] = {
                'type': ['null', 'string']
            }

    return schema
예제 #59
0
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    Guess the headers and column types of a CSV file with messytables.

    :param file_handle: file handle opened in binary mode
    :param types: candidate types handed to ``messytables.type_guess``
    :return: (headers, types, row_set)
    """
    # A table set is a collection of tables; only the first one is used.
    row_set = messytables.CSVTableSet(file_handle).tables[0]

    # A row set can only be iterated once, so peek at it via its sample.
    print(next(row_set.sample))

    # Header names and the position of the header row.
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # offset + 1 starts iteration at the content, not the header.
    row_set.register_processor(messytables.offset_processor(offset + 1))

    # Strictly guessed column types, applied to each row on iteration.
    guessed_types = messytables.type_guess(row_set.sample, types, strict=True)
    row_set.register_processor(messytables.types_processor(guessed_types))

    return headers, guessed_types, row_set