Example #1
    def test_string_match(self):
        args = [
            '-c', '1', '-m', 'ILLINOIS',
            'examples/realdata/FY09_EDU_Recipients_by_State.csv'
        ]
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), [
            'State Name', 'State Abbreviate', 'Code',
            'Montgomery GI Bill-Active Duty',
            'Montgomery GI Bill- Selective Reserve',
            'Dependents\' Educational Assistance',
            'Reserve Educational Assistance Program',
            'Post-Vietnam Era Veteran\'s Educational Assistance Program',
            'TOTAL', ''
        ])
        self.assertEqual(reader.next(), [
            'ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19',
            '21,964', ''
        ])
Example #2
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for i, row in enumerate(rows):
            out_row = [row[c] if c < len(row) else None for c in column_ids] 

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue
            
            output.writerow(out_row)
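
Several examples in this listing fall back to csvkit's make_default_headers helper when --no-header-row is passed. A minimal sketch of what it plausibly does, consistent with the 'column1'/'column2' names the --no-header-row tests further down assert against (an assumption for illustration, not csvkit's actual source):

def make_default_headers(n):
    # Hypothetical stand-in: generate 'column1', 'column2', ..., 'columnN'
    return ['column%i' % (i + 1) for i in range(n)]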
Example #3
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        # Note: this unconditionally overrides the header-derived names above
        column_names = self.args.columns.split(',')

        part_count = 0
        output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
        output.writerow(column_names)

        count = 0
        for row in rows:
            if (self.args.lines > 0) and (count == self.args.lines):
                part_count += 1
                count = 0
                # couldn't find a better way to close the file
                del output
                output = CSVKitWriter(open(self.args.file._lazy_args[0] + ".part.%d" % part_count, 'w'), **self.writer_kwargs)
                output.writerow(column_names)

            output.writerow(row)
            count += 1
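
The `del output` workaround above leaves closing the part file to garbage collection. A sketch of one way to close it deterministically instead, by keeping the underlying file handle (untested against this codebase; names are illustrative):

        out_path = self.args.file._lazy_args[0] + '.part.%d' % part_count
        out_fh = open(out_path, 'w')
        output = CSVKitWriter(out_fh, **self.writer_kwargs)
        # ... then, when rotating to the next part:
        out_fh.close()  # close explicitly instead of `del output`
        out_fh = open(self.args.file._lazy_args[0] + '.part.%d' % (part_count + 1), 'w')
        output = CSVKitWriter(out_fh, **self.writer_kwargs)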
Example #4
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        # Make a default header row if none exists
        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        column_names = list(column_names)

        # prepend 'line_number' column with line numbers if --linenumbers option
        if self.args.line_numbers:
            column_names.insert(0, 'line_number')
            rows = [
                list(itertools.chain([str(i + 1)], row))
                for i, row in enumerate(rows)
            ]

        # Convert to normal list of rows
        rows = list(rows)

        # Insert the column names at the top
        rows.insert(0, column_names)

        widths = []

        for row in rows:
            for i, v in enumerate(row):
                try:
                    if len(v) > widths[i]:
                        widths[i] = len(v)
                except IndexError:
                    widths.append(len(v))

        # Dashes span each width with '+' character at intersection of
        # horizontal and vertical dividers.
        divider = '|--' + '-+-'.join('-' * w for w in widths) + '--|'

        self.output_file.write('%s\n' % divider)

        for i, row in enumerate(rows):
            output = []

            for j, d in enumerate(row):
                if d is None:
                    d = ''
                output.append(' %s ' % unicode(d).ljust(widths[j]))

            self.output_file.write(
                ('| %s |\n' % ('|'.join(output))).encode('utf-8'))

            if (i == 0 or i == len(rows) - 1):
                self.output_file.write('%s\n' % divider)
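
To make the width and divider arithmetic above concrete, here is a small hand-worked illustration (derived from the logic above, not captured from an actual run):

# For a 2x2 table the widths work out to [1, 1]:
rows = [['a', 'b'], ['1', '2']]

# divider = '|--' + '-+-'.join('-' * w for w in [1, 1]) + '--|'
#         = '|----+----|'
#
# The rendered output is then:
#
#   |----+----|
#   |  a | b  |
#   |----+----|    <- divider repeats after the header row (i == 0)
#   |  1 | 2  |
#   |----+----|    <- and after the last row (i == len(rows) - 1)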
Example #5
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next() # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % (encoding))

        return samples
Example #6
    def test_no_grouping(self):
        # stack two CSV files
        args = ['examples/dummy.csv', 'examples/dummy2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'b', 'c'])
        self.assertEqual(reader.next()[0], '1')
        self.assertEqual(reader.next()[0], '1')
Example #7
    def test_no_header_row(self):
        # stack two CSV files
        args = ['--no-header-row', 'examples/no_header_row.csv', 'examples/no_header_row2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next()[0], 'column1')
        self.assertEqual(reader.next()[0], '1')
        self.assertEqual(reader.next()[0], '4')
Example #8
    def test_explicit_grouping(self):
        # stack two CSV files
        args = ['--groups', 'asd,sdf', '-n', 'foo', 'examples/dummy.csv', 'examples/dummy2.csv']
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['foo', 'a', 'b', 'c'])
        self.assertEqual(reader.next()[0], 'asd')
        self.assertEqual(reader.next()[0], 'sdf')
Example #9
    def from_csv(cls, f, name='from_csv_table', **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        sample = contents
        dialect = sniffer.sniff_dialect(sample, **kwargs)

        f = StringIO(contents) 
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        data_columns = [[] for c in headers] 

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(d.strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns): 
            columns.append(Column(i, headers[i], c))

        return Table(columns, name=name)
Example #10
    def main(self):
        """
        Convert CSV to JSON. 
        """
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        stream = codecs.getwriter('utf-8')(self.output_file)

        if self.args.key:
            output = {}

            for row in rows:
                row_dict = dict(zip(column_names, row))
                k = row_dict[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        unicode(k))

                output[k] = row_dict
        else:
            output = [dict(zip(column_names, row)) for row in rows]

        json.dump(output,
                  stream,
                  ensure_ascii=False,
                  indent=self.args.indent,
                  encoding='utf-8')
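
For reference, a sketch of the shape this produces for a two-column input (illustrative data; dict key order in the JSON output may vary):

import json

column_names = ['id', 'name']
rows = [['1', 'foo']]

row_dicts = [dict(zip(column_names, row)) for row in rows]
print json.dumps(row_dicts)                              # without --key
# e.g. [{"id": "1", "name": "foo"}]
print json.dumps(dict((d['id'], d) for d in row_dicts))  # with --key id
# e.g. {"1": {"id": "1", "name": "foo"}}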
Example #11
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
Example #12
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = [line.rstrip() for line in self.args.matchfile]
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
Example #13
    def main(self):
        if len(self.args.files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                
                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
Example #14
def sample_data(path, dialect_parameters, sample_size, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            reader.next()  # skip headers
            samples = []

            for row in islice(reader, sample_size):
                samples.append(row)
        except UnicodeDecodeError:
            raise DataSamplingError(
                'This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.'
                % (encoding))

        return samples
Example #15
    def from_csv(cls, f, name='from_csv_table', **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        sample = contents
        dialect = sniffer.sniff_dialect(sample)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(d.strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(i, headers[i], c))

        return Table(columns, name=name)
Example #16
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        # Read in header and rows
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = reader.next()

        # Determine columns to group by, default to all columns
        if self.args.columns is None:
            grouped_columns_ids = []
        else:
            grouped_columns_ids = parse_column_identifiers(self.args.columns,
                                                           column_names,
                                                           self.args.zero_based)

        # Build one aggregator per (function, column) pair
        aggregations = []
        try:
            for fun, cols in self.args.aggregations:
                for col in parse_column_identifiers(cols, column_names, self.args.zero_based):
                    aggregations.append(aggregate_functions[fun](col))
        except KeyError:
            self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))

        # Write the output
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in group_rows(column_names, reader, grouped_columns_ids,
                              aggregations):
            output.writerow(row)
Example #17
    def main(self):
        if len(self.args.files) < 2:
            sys.exit('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                sys.exit(
                    'The number of grouping values must be equal to the number of CSV files being stacked.'
                )
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
Example #18
    def from_csv(cls,
                 f,
                 name='from_csv_table',
                 snifflimit=None,
                 column_ids=None,
                 blanks_as_nulls=True,
                 zero_based=False,
                 **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()

        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers,
                                                  zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))

        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(
                Column(column_ids[i],
                       headers[i],
                       c,
                       blanks_as_nulls=blanks_as_nulls))

        return Table(columns, name=name)
Example #19
    def load(self):
        '''
            Loads the cleaned up csv files into the database
            Checks record count against csv line count
        '''
        ## get a list of tables in the database
        c = connection.cursor()
        c.execute('SHOW TABLES')
        table_list = [t[0] for t in c.fetchall()]

        ### build a dictionary of tables and the paths to the csvs for loading
        table_dict = {}
        for name in os.listdir(self.csv_dir):

            csv_path = os.path.join(
                self.csv_dir,
                name
            )

            for table in table_list:
                if table == name.replace('.csv', '').upper():
                    table_dict[name] = {'table_name': table, 'csv_path': csv_path}

        ## load up the data
        for csv_name, query_dict in table_dict.items():
            #print 'working on %s' % csv_name
            table_name = query_dict['table_name']
            csv_path = query_dict['csv_path']

            c.execute('DELETE FROM %s' % table_name)
            #print 'deleted records from %s' % table_name

            bulk_sql_load_part_1 = '''
                LOAD DATA LOCAL INFILE '%s'
                INTO TABLE %s
                FIELDS TERMINATED BY ','
                OPTIONALLY ENCLOSED BY '"'
                IGNORE 1 LINES
                (
            ''' % (csv_path, table_name)
            infile = open(csv_path)
            csv_reader = CSVKitReader(infile)
            headers = csv_reader.next()

            infile.close()
            infile = open(csv_path)
            csv_record_cnt = len(infile.readlines()) - 1
            infile.close()

            sql_fields = ['`%s`' % h for h in headers]
            bulk_sql_load = bulk_sql_load_part_1 + ','.join(sql_fields) + ')'
            cnt = c.execute(bulk_sql_load)
            transaction.commit_unless_managed()

            # check load, make sure record count matches
            if cnt == csv_record_cnt:
                print "record counts match\t\t\t\t%s" % csv_name
            else:
                print 'table_cnt: %s\tcsv_lines: %s\t\t%s' % (cnt, csv_record_cnt, csv_name)
Example #20
    def __init__(self, schema):
        self.fields = [] # A list of FixedWidthFields

        schema_reader = CSVKitReader(schema)
        schema_decoder = SchemaDecoder(schema_reader.next())

        for row in schema_reader:
            self.fields.append(schema_decoder(row))
Example #21
def infer_types(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)

    return zip(headers, [t.__name__ for t in normal_types])
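
A quick usage sketch; the exact type names in the result come from csvkit's normalize_table, so the expected output shown is an assumption:

from StringIO import StringIO

f = StringIO('a,b\n1,2.5\n3,4.5\n')
print infer_types(f)
# Plausibly: [('a', 'int'), ('b', 'float')] -- actual names depend on csvkit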
Example #22
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = StringIO(contents)
        rows = CSVKitReader(f, **kwargs)

        if no_header_row:
            # Peek at a row to infer column names from
            row = next(rows) 

            headers = make_default_headers(len(row))
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
            data_columns = [[] for c in headers]

            # Put row back on top
            rows = itertools.chain([row], rows)
        else:
            headers = rows.next()
            
            if column_ids:
                column_ids = parse_column_identifiers(column_ids, headers, zero_based)
                headers = [headers[c] for c in column_ids]
            else:
                column_ids = range(len(headers))
        
            data_columns = [[] for c in headers]

        for i, row in enumerate(rows):
            for j, d in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types))

        return Table(columns, name=name)
Example #23
    def test_no_header_row(self):
        # stack two CSV files
        args = [
            '--no-header-row', 'examples/no_header_row.csv',
            'examples/no_header_row2.csv'
        ]
        output_file = StringIO.StringIO()
        utility = CSVStack(args, output_file)

        utility.main()

        # verify the stacked file's contents
        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next()[0], 'column1')
        self.assertEqual(reader.next()[0], '1')
        self.assertEqual(reader.next()[0], '4')
Example #24
def print_column_names(f, output, **reader_kwargs):
    """
    Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
    """
    rows = CSVKitReader(f, **reader_kwargs)
    column_names = rows.next()

    for i, c in enumerate(column_names):
        output.write('%3i: %s\n' % (i + 1, c))
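
A quick usage sketch showing the 1-based, width-3 index formatting:

import sys
from StringIO import StringIO

print_column_names(StringIO('a,b,c\n1,2,3\n'), sys.stdout)
# Output:
#   1: a
#   2: b
#   3: c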
Example #25
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)
        cnames = reader.next()
        cids = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
        mods = {idx: self.args.expr for idx in cids}
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        reader = sed.CsvFilter(reader, mods, header=False)
        output.writerow(cnames)
        for row in reader:
            output.writerow(row)
Example #26
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError(_('This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.') % encoding)

        return headers
Example #27
def load_data(input_file):
    for file in glob.glob(input_file):
        print file
        # Open the globbed file itself (the original opened `input_file`,
        # the glob pattern, which is a bug when it matches multiple files)
        open_file = open(file)
        grasp = CSVKitReader(open_file, encoding='utf-8', delimiter='\t')
        grasp.next()  # skip header
        for row in grasp:
            try:
                one_snp_json = _map_line_to_json(row)
                yield one_snp_json
            except Exception:
                # Report the last (index, value) pair of the offending row
                wrong = [(i, value) for (i, value) in enumerate(row)]
                print wrong[-1]

        open_file.close()
Example #28
    def test_no_match(self):
        args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'b', 'c'])
Example #29
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)
        cnames = reader.next()
        cids = parse_column_identifiers(self.args.columns, cnames,
                                        self.args.zero_based)
        mods = {idx: self.args.expr for idx in cids}
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        reader = sed.CsvFilter(reader, mods, header=False)
        output.writerow(cnames)
        for row in reader:
            output.writerow(row)
Example #30
    def __init__(self, schema):
        self.fields = [] # A list of FixedWidthFields

        schema_reader = CSVKitReader(schema)
        schema_decoder = SchemaDecoder(schema_reader.next())

        for i, row in enumerate(schema_reader):
            try:
                self.fields.append(schema_decoder(row))
            except Exception, e:
                raise ValueError("Error reading schema at line %i: %s" % (i + 2, e))
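
For context, the schema argument is itself a file-like object containing CSV, one fixed-width field per row. A plausible minimal schema, assuming csvkit's usual column/start/length header names (verify against the SchemaDecoder in use):

from StringIO import StringIO

# Hypothetical schema: field name, starting offset, and width per row
schema = StringIO('column,start,length\n'
                  'name,0,10\n'
                  'age,10,3\n')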
Example #31
    def test_no_match(self):
        args = ['-c', '1', '-m', 'NO MATCH', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'b', 'c'])
Example #32
    def test_include_and_exclude(self):
        args = ['-c', '1,3', '-C', '3', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a'])
        self.assertEqual(reader.next(), ['1'])
Example #33
    def test_with_bzip2(self):
        args = ['-c', '1,3', 'examples/dummy.csv.bz2']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'c'])
        self.assertEqual(reader.next(), ['1', '3'])
Example #34
    def test_no_header_row(self):
        args = ['-c', '2', '--no-header-row', 'examples/no_header_row.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['column2'])
        self.assertEqual(reader.next(), ['2'])
Example #35
    def test_string_match(self):
        args = ['-c', '1', '-m', 'ILLINOIS', 'examples/realdata/FY09_EDU_Recipients_by_State.csv']
        output_file = StringIO.StringIO()
        utility = CSVGrep(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['State Name', 'State Abbreviate', 'Code', 'Montgomery GI Bill-Active Duty', 'Montgomery GI Bill- Selective Reserve', 'Dependents\' Educational Assistance', 'Reserve Educational Assistance Program', 'Post-Vietnam Era Veteran\'s Educational Assistance Program', 'TOTAL', ''])
        self.assertEqual(reader.next(), ['ILLINOIS', 'IL', '17', '15,659', '2,491', '2,025', '1,770', '19', '21,964', ''])
Example #36
    def test_include_and_exclude(self):
        args = ["-c", "1,3", "-C", "3", "examples/dummy.csv"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["a"])
        self.assertEqual(reader.next(), ["1"])
Example #37
    def test_with_bzip2(self):
        args = ["-c", "1,3", "examples/dummy.csv.bz2"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["a", "c"])
        self.assertEqual(reader.next(), ["1", "3"])
Example #38
    def test_simple(self):
        args = ['-c', '1,3', 'examples/dummy.csv']
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ['a', 'c'])
        self.assertEqual(reader.next(), ['1', '3'])
Example #39
    def test_no_header_row(self):
        args = ["-c", "2", "--no-header-row", "examples/no_header_row.csv"]
        output_file = StringIO.StringIO()
        utility = CSVCut(args, output_file)

        utility.main()

        input_file = StringIO.StringIO(output_file.getvalue())
        reader = CSVKitReader(input_file)

        self.assertEqual(reader.next(), ["column2"])
        self.assertEqual(reader.next(), ["2"])
Example #40
def extract_column_names(path, dialect_parameters, encoding='utf-8'):
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect_parameters)

        try:
            headers = reader.next()
        except UnicodeDecodeError:
            raise DataSamplingError(
                'This CSV file contains characters that are not %s encoded. You need to input the correct encoding in order to import data from this file.'
                % encoding)

        return headers
Example #41
def sample_data(f, sample_size=5):
    reader = CSVKitReader(f)
    headers = reader.next()
        
    samples = []

    for i, row in enumerate(islice(reader, sample_size), start=1):
        samples.append({
            'row': i, 
            'data': row,
        })

    return samples 
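
A quick usage sketch, with the return shape following directly from the code above:

from StringIO import StringIO

f = StringIO('a,b\nx,y\nz,w\n')
print sample_data(f)
# [{'row': 1, 'data': ['x', 'y']}, {'row': 2, 'data': ['z', 'w']}]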
Example #42
def infer_schema(f, sample_size=100):
    reader = CSVKitReader(f)
    headers = reader.next()

    sample = islice(reader, sample_size)
    normal_types, normal_values = normalize_table(sample)
    type_names = [t.__name__ for t in normal_types]

    return [{
        'column': h,
        'simple_type': t,
        'meta_type': None,
        'indexed': False
    } for h, t in zip(headers, type_names)]
Example #43
    def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, type_inference=True, **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        if snifflimit:
            sample = contents[:snifflimit]
        else:
            sample = contents

        dialect = sniffer.sniff_dialect(sample)

        normal_type = kwargs.pop("normal_type", InvalidType)

        f = StringIO(contents)
        reader = CSVKitReader(f, dialect=dialect, **kwargs)

        headers = reader.next()
        
        if column_ids:
            column_ids = parse_column_identifiers(column_ids, headers, zero_based)
            headers = [headers[c] for c in column_ids]
        else:
            column_ids = range(len(headers))
        
        data_columns = [[] for c in headers]

        for row in reader:
            for i, d in enumerate(row):
                try:
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, type_inference=type_inference, normal_type=normal_type))

        return Table(columns, name=name)
Example #44
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write("%3i: %s\n" % (i, c))
Example #45
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))
Example #46
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()

        sample = islice(reader, sample_size)
        normal_types, normal_values = normalize_table(sample)

        type_names = []

        for t in normal_types:
            # csvkit recognizes dates and times separately, but we lump them together
            if t in [datetime.date, datetime.time]:
                type_names.append('datetime')
            else:
                type_names.append(t.__name__)

        return type_names 
Example #47
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        if self.args.no_header_row:
            raise RequiredHeaderError, 'You cannot use --no-header-row with the -n or --names options.'

        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))
Example #48
    def print_column_names(self):
        """
        Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout).
        """
        if self.args.no_header_row:
            raise RequiredHeaderError, 'You cannot use --no-header-row with the -n or --names options.'

        f = self.args.file
        output = self.output_file
        try:
            zero_based = self.args.zero_based
        except:
            zero_based = False

        rows = CSVKitReader(f, **self.reader_kwargs)
        column_names = rows.next()

        for i, c in enumerate(column_names):
            if not zero_based:
                i += 1
            output.write('%3i: %s\n' % (i, c))
Example #49
    def main(self):
        if self.args.names_only:
            print_column_names(self.args.file, self.output_file,
                               **self.reader_kwargs)
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for i, row in enumerate(rows):
            self.input_line_number = i + 1
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
Example #50
def guess_column_types(path, dialect, sample_size, encoding='utf-8'):
    """
    Guess column types based on a sample of data.
    """
    with open(path, 'r') as f:
        reader = CSVKitReader(f, encoding=encoding, **dialect)
        headers = reader.next()

        sample = islice(reader, sample_size)
        normal_types, normal_values = normalize_table(sample)

        type_names = []

        for t in normal_types:
            if t is NoneType:
                type_names.append(None)
            else:
                type_names.append(t.__name__)

        # If a final column had no values csvkit will have dropped it
        while len(type_names) < len(headers):
            type_names.append(None)

        return type_names
Example #51
    def from_csv(cls,
                 f,
                 name='from_csv_table',
                 snifflimit=None,
                 column_ids=None,
                 blanks_as_nulls=True,
                 zero_based=False,
                 infer_types=True,
                 no_header_row=False,
                 **kwargs):
        """
        Creates a new Table from a file-like object containing CSV data.

        Note: the column_ids argument will cause only those columns with a matching identifier
        to be parsed, type inferred, etc. However, their order/index property will reflect the
        original data (e.g. column 8 will still be "order" 7, even if it's the third column
        in the resulting Table.
        """
        # This bit of nonsense is to deal with "files" from stdin,
        # which are not seekable and thus must be buffered
        contents = f.read()

        # snifflimit == 0 means do not sniff
        if snifflimit is None:
            kwargs['dialect'] = sniffer.sniff_dialect(contents)
        elif snifflimit > 0:
            kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit])

        f = StringIO(contents)
        rows = CSVKitReader(f, **kwargs)

        if no_header_row:
            # Peek at a row to infer column names from
            row = next(rows)

            headers = make_default_headers(len(row))
            column_ids = range(len(row))
            data_columns = [[] for c in headers]

            # Put row back on top
            rows = itertools.chain([row], rows)
        else:
            headers = rows.next()

            if column_ids:
                column_ids = parse_column_identifiers(column_ids, headers,
                                                      zero_based)
                headers = [headers[c] for c in column_ids]
            else:
                column_ids = range(len(headers))

            data_columns = [[] for c in headers]

        for i, row in enumerate(rows):
            for j, d in enumerate(row):
                try:
                    data_columns[j].append(row[column_ids[j]].strip())
                except IndexError:
                    # Non-rectangular data is truncated
                    break

        columns = []

        for i, c in enumerate(data_columns):
            columns.append(
                Column(column_ids[i],
                       headers[i],
                       c,
                       blanks_as_nulls=blanks_as_nulls,
                       infer_types=infer_types))

        return Table(columns, name=name)
Example #52
    def run(self,
            dataset_slug,
            upload_id,
            external_id_field_index=None,
            *args,
            **kwargs):
        """
        Execute import.
        """
        from panda.models import Dataset, DataUpload

        log = logging.getLogger(self.name)
        log.info('Beginning import, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        upload = DataUpload.objects.get(id=upload_id)

        task_status = dataset.current_task
        task_status.begin('Preparing to import')

        line_count = self._count_lines(upload.get_path())

        if self.is_aborted():
            task_status.abort('Aborted during preparation')

            log.warning('Import aborted, dataset_slug: %s' % dataset_slug)

            return

        f = open(upload.get_path(), 'r')

        reader = CSVKitReader(f,
                              encoding=upload.encoding,
                              **upload.dialect_as_parameters())
        reader.next()

        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while True:
            # The row number which is about to be read, for error handling and indexing
            i += 1

            try:
                row = reader.next()
            except StopIteration:
                i -= 1
                break
            except UnicodeDecodeError:
                raise DataImportError(
                    'This CSV file contains characters that are not %s encoded in or after row %i. You need to re-upload this file and input the correct encoding in order to import data from this file.'
                    % (upload.encoding, i))

            external_id = None

            if external_id_field_index is not None:
                external_id = row[external_id_field_index]

            data = utils.solr.make_data_row(dataset,
                                            row,
                                            data_upload=upload,
                                            external_id=external_id)
            data = data_typer(data, row)

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update('%.0f%% complete (estimated)' %
                                   floor(float(i) / float(line_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        'Aborted after importing %.0f%% (estimated)' %
                        floor(float(i) / float(line_count) * 100))

                    log.warning('Import aborted, dataset_slug: %s' %
                                dataset_slug)

                    return

                time.sleep(throttle)

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        f.close()

        task_status.update('100% complete')

        # Refresh dataset from the database so there is no chance of clobbering changes made since the task started
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Import could not be completed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        if not dataset.row_count:
            dataset.row_count = i
        else:
            dataset.row_count += i

        dataset.column_schema = data_typer.schema

        dataset.save()

        # Refresh the upload from the database
        upload = DataUpload.objects.get(id=upload_id)

        upload.imported = True
        upload.save()

        log.info('Finished import, dataset_slug: %s' % dataset_slug)

        return data_typer
Example #53
        'description':
        'The crowdsourced jobs list that powers http://www.newsnerdjobs.com/.'
    }

    response = panda_put(PANDA_DATASET_URL,
                         json.dumps(dataset),
                         params={'columns': ','.join(COLUMNS)})

# Open connection to Google
response = requests.get(
    'https://docs.google.com/spreadsheet/pub?key=%s&single=true&gid=4&output=csv'
    % SPREADSHEET_ID)
csv = StringIO(response.content)

reader = CSVKitReader(csv)
reader.next()

put_data = {'objects': []}

# Delete existing data in panda
response = panda_delete(PANDA_DATA_URL)

for i, row in enumerate(reader):
    put_data['objects'].append({'data': row})

    if i and i % PANDA_BULK_UPDATE_SIZE == 0:
        print 'Updating %i rows...' % PANDA_BULK_UPDATE_SIZE

        panda_put(PANDA_DATA_URL, json.dumps(put_data))
        put_data['objects'] = []
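
As excerpted, any rows still in put_data when the loop finishes (the final partial batch) are never sent; presumably the full script ends with a flush along these lines:

# Hypothetical final flush of the last partial batch
if put_data['objects']:
    print 'Updating %i rows...' % len(put_data['objects'])
    panda_put(PANDA_DATA_URL, json.dumps(put_data))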
Example #54
    def main(self):
        """
        Convert CSV to JSON. 
        """
        if self.args.lat and not self.args.lon:
            self.argparser.error(
                '--lon is required whenever --lat is specified.')

        if self.args.lon and not self.args.lat:
            self.argparser.error(
                '--lat is required whenever --lon is specified.')

        if self.args.crs and not self.args.lat:
            self.argparser.error(
                '--crs is only allowed when --lat and --lon are also specified.'
            )

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        stream = codecs.getwriter('utf-8')(self.output_file)

        # GeoJSON
        if self.args.lat and self.args.lon:
            features = []
            min_lon = None
            min_lat = None
            max_lon = None
            max_lat = None

            lat_column = match_column_identifier(column_names, self.args.lat,
                                                 self.args.zero_based)
            lon_column = match_column_identifier(column_names, self.args.lon,
                                                 self.args.zero_based)

            if self.args.key:
                id_column = match_column_identifier(column_names,
                                                    self.args.key,
                                                    self.args.zero_based)
            else:
                id_column = None

            for row in rows:
                feature = {'type': 'Feature'}
                properties = {}
                geoid = None
                lat = None
                lon = None

                for i, c in enumerate(row):
                    if i == lat_column:
                        lat = float(c)

                        if min_lat is None or lat < min_lat:
                            min_lat = lat

                        if max_lat is None or lat > max_lat:
                            max_lat = lat
                    elif i == lon_column:
                        lon = float(c)

                        if min_lon is None or lon < min_lon:
                            min_lon = lon

                        if max_lon is None or lon > max_lon:
                            max_lon = lon
                    elif id_column is not None and i == id_column:
                        geoid = c
                    else:
                        properties[column_names[i]] = c

                if id_column is not None:
                    feature['id'] = geoid

                feature['geometry'] = {
                    'type': 'Point',
                    'coordinates': [lon, lat]
                }

                feature['properties'] = properties

                features.append(feature)

            output = {
                'type': 'FeatureCollection',
                'bbox': [min_lon, min_lat, max_lon, max_lat],
                'features': features
            }

            if self.args.crs:
                output['crs'] = {
                    'type': 'name',
                    'properties': {
                        'name': self.args.crs
                    }
                }
        # Keyed JSON
        elif self.args.key:
            output = {}

            for row in rows:
                row_dict = dict(zip(column_names, row))
                k = row_dict[self.args.key]

                if k in output:
                    raise NonUniqueKeyColumnException(
                        'Value %s is not unique in the key column.' %
                        unicode(k))

                output[k] = row_dict
        # Boring JSON
        else:
            output = [dict(zip(column_names, row)) for row in rows]

        json.dump(output,
                  stream,
                  ensure_ascii=False,
                  indent=self.args.indent,
                  encoding='utf-8')