def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = next(rows) if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)
def from_csv(cls, f, name='from_csv_table', **kwargs):
    """
    Build a Table from a file-like object holding CSV data.

    The input is read fully into memory first so that non-seekable
    streams (e.g. stdin) can be both sniffed and re-parsed.
    """
    # Buffer everything up front: stdin-style "files" cannot seek.
    buffered = f.read()
    dialect = sniffer.sniff_dialect(buffered)

    csv_reader = CSVKitReader(StringIO(buffered), dialect=dialect, **kwargs)
    headers = csv_reader.next()

    data_columns = [[] for _ in headers]

    for row in csv_reader:
        for idx, value in enumerate(row):
            try:
                data_columns[idx].append(value.strip())
            except IndexError:
                # Rows wider than the header row are truncated.
                break

    columns = [Column(idx, headers[idx], values)
               for idx, values in enumerate(data_columns)]

    return Table(columns, name=name)
def from_csv(cls, f, name='from_csv_table', **kwargs):
    """
    Create a Table from a file-like object of CSV data.

    Keyword arguments are forwarded both to the dialect sniffer and to
    the underlying CSVKitReader.
    """
    # Read the whole stream first; stdin-like sources cannot be rewound.
    raw = f.read()

    sniffed = sniffer.sniff_dialect(raw, **kwargs)
    rows = CSVKitReader(StringIO(raw), dialect=sniffed, **kwargs)

    header_row = rows.next()
    collected = [[] for _ in header_row]
    n_cols = len(collected)

    for record in rows:
        for col, cell in enumerate(record):
            if col >= n_cols:
                # Non-rectangular data is truncated
                break
            collected[col].append(cell.strip())

    out = []
    for col, values in enumerate(collected):
        out.append(Column(col, header_row[col], values))

    return Table(out, name=name)
def load_file(file_name, db_engine): # Pass file name and SQLAlchemy db engine # Opens file, makes a file object, creates table # Builds and runs a COPY FROM statement print '-----------------' print ' Opening {} ({})...'.format(file_name, datetime.now() - start_time) f = codecs.open('data/{}'.format(file_name), 'rU') print ' Sniffing file dialect ({})...'.format(datetime.now() - start_time) dialect = sniffer.sniff_dialect(f.read()) # Return to top of file f.seek(0) print ' Making csv Table object ({})...'.format(datetime.now() - start_time) from_file = Table.from_csv(f, name = file_name.rstrip('.csv'), encoding = 'utf-8') # Here be some overhead print ' Making SQLAlchemy Table object ({})...'.format(datetime.now() - start_time) sql_table = sql.make_table(from_file) print ' Creating db table ({})...'.format(datetime.now() - start_time) sql_table.create(engine, checkfirst=True) print ' Loading {} ({})...'.format(file_name, datetime.now() - start_time) copy_from_sql = '''COPY "{table_name}" FROM '{file_w_path}' DELIMITER '{delimiter}' QUOTE '{quote_character}' ENCODING 'UTF8' CSV HEADER;'''.format( table_name = file_name.rstrip('.csv') , file_w_path = os.getcwd() + '/data/' + file_name , delimiter = dialect.delimiter , quote_character = dialect.quotechar , escape_character = '' if dialect.escapechar is None else dialect.escapechar ) conn = db_engine.connect() t = conn.begin() try: conn.execute(copy_from_sql) t.commit() except: t.rollback() print copy_from_sql.replace('\t', '') print "Failed to commit." conn.close()
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8
    will still be "order" 7, even if it's the third column in the
    resulting Table).
    """
    # Buffer the whole stream: "files" from stdin are not seekable.
    contents = f.read()

    # Sniff the dialect from at most `snifflimit` characters.
    sample = contents[:snifflimit] if snifflimit else contents
    dialect = sniffer.sniff_dialect(sample)

    reader = CSVKitReader(StringIO(contents), dialect=dialect, **kwargs)
    headers = reader.next()

    if column_ids:
        # Restrict parsing to the requested columns, keeping source ids.
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for _ in headers]

    for row in reader:
        for out_idx, _cell in enumerate(row):
            try:
                data_columns[out_idx].append(row[column_ids[out_idx]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = [
        Column(column_ids[i], headers[i], cells, blanks_as_nulls=blanks_as_nulls)
        for i, cells in enumerate(data_columns)
    ]

    return Table(columns, name=name)
def getReader(uploadfile):
    """
    Return a csv reader for an uploaded file.

    .xls uploads are converted to CSV text first; anything else has its
    dialect sniffed from a leading sample before parsing.
    """
    stream = uploadfile.file
    extension = os.path.splitext(uploadfile.filename)[1]

    # make sure to convert excel files
    if extension == '.xls':
        return reader(StringIO.StringIO(xls2csv(stream)))

    # Sniff the dialect from the first 4 KB, then rewind for the real read.
    dialect = sniffer.sniff_dialect(stream.read(4096))
    stream.seek(0)
    return reader(stream, dialect=dialect)
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
    """
    Creates a new Table from a file-like object containing CSV data.

    Note: the column_ids argument will cause only those columns with a
    matching identifier to be parsed, type inferred, etc. However, their
    order/index property will reflect the original data (e.g. column 8
    will still be "order" 7, even if it's the third column in the
    resulting Table).
    """
    # Buffer the stream so non-seekable input (stdin) can be sniffed.
    contents = f.read()

    sample = contents if not snifflimit else contents[:snifflimit]
    dialect = sniffer.sniff_dialect(sample)

    # Pull the Column-level option out *before* the remaining kwargs are
    # handed to the reader.
    normal_type = kwargs.pop("normal_type", InvalidType)

    reader = CSVKitReader(StringIO(contents), dialect=dialect, **kwargs)
    headers = reader.next()

    if column_ids:
        column_ids = parse_column_identifiers(column_ids, headers, zero_based)
        headers = [headers[c] for c in column_ids]
    else:
        column_ids = range(len(headers))

    data_columns = [[] for _ in headers]

    for row in reader:
        for idx in range(len(row)):
            try:
                data_columns[idx].append(row[column_ids[idx]].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break

    columns = []
    for idx, cells in enumerate(data_columns):
        columns.append(Column(column_ids[idx], headers[idx], cells,
                              blanks_as_nulls=blanks_as_nulls,
                              type_inference=type_inference,
                              normal_type=normal_type))

    return Table(columns, name=name)
def detect_if_list(v):
    '''
    Detects list using csvkit sniffer to detect delimiter, splitting on
    delim and filtering common delims to see final list length.

    Returns either a list or False.
    '''
    delimiters = ['|', ',', ';', '$']
    LIST_LEN_THRESHOLD = 2

    dialect = sniffer.sniff_dialect(v)
    if dialect:
        delim = dialect.delimiter
        split_vals = v.split(delim)
        filtered_vals = [
            x for x in split_vals
            if (x and (x not in delimiters))
        ]

        # BUG FIX: the original tested `filtered_vals >= 2`, comparing a
        # list to an int — always True in Python 2 (cross-type ordering)
        # and a TypeError in Python 3. Compare the *length* against the
        # (previously unused) threshold instead.
        if len(filtered_vals) >= LIST_LEN_THRESHOLD:
            return filtered_vals

    return False
def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = agate.reader(f, **kwargs) try: if no_header_row: # Peek at a row to infer column names from, and put it back on top row = next(rows) rows = itertools.chain([row], rows) headers = make_default_headers(len(row)) else: headers = next(rows) except StopIteration: # The file is `/dev/null`. headers = [] pass if no_header_row or column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] width = len(data_columns) for i, row in enumerate(rows): j = 0 for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break j += 1 # Populate remaining columns with None while j < width: data_columns[j].append(None) j += 1 columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name)