示例#1
0
文件: xlsx.py 项目: srkgupta/csvkit
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False


    book = load_workbook(f, use_iterators=True, data_only=True)
    sheets = book.get_sheet_names()
    fname = os.path.splitext(f.name)[0];

    for name in sheets:
        outputfname = fname + "_" + name.replace(" ", "") + ".csv"
        of = open(outputfname,'w')
        sheet = book.get_sheet_by_name(name)
        if not streaming:
            output = six.StringIO()
            writer = CSVKitWriter(output)

        for i, row in enumerate(sheet.iter_rows()):
            if i == 0:
                writer.writerow([c.value for c in row]) 
                continue

            out_row = []

            for c in row:
                value = c.value

                if value.__class__ is datetime.datetime:
                    # Handle default XLSX date as 00:00 time 
                    if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                        value = value.time() 

                        value = normalize_datetime(value)
                    elif value.time() == NULL_TIME:
                        value = value.date()
                    else:
                        value = normalize_datetime(value)
                elif value.__class__ is float:
                    if value % 1 == 0:
                        value = int(value)

                if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                    value = value.isoformat()

                out_row.append(value)

            writer.writerow(out_row)

        if not streaming:
            data = output.getvalue()
            of.write(data)
            of.close()            

    # Return empty string when streaming
    return ''
示例#2
0
文件: csvcut.py 项目: gepuro/csvkit
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
示例#3
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        #Read in header and rows
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = reader.next()
        if self.args.columns is None:
            grouped_columns_ids = []
        else:
            grouped_columns_ids = parse_column_identifiers(self.args.columns,
                                                       column_names,
                                                       self.args.zero_based)
        aggregations = []
        try:
            for (fun, cols) in map(lambda (f, cols): (
            f, parse_column_identifiers(cols, column_names, self.args.zero_based)),
                                   self.args.aggregations):
                for col in cols:
                    aggregations.append(aggregate_functions[fun](col))
        except KeyError:
            self.argparser.error("Wrong aggregator function. Available: " + ', '.join(aggregate_functions.keys()))
        #Determine columns to group by, default to all columns


        #Write the output
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        for row in group_rows(column_names, reader, grouped_columns_ids,
                              aggregations):
            output.writerow(row)
    def log_errors(self, rows):
        """
        Log any errors to a csv file
        """
        # Make sure the log directory exists
        os.path.exists(self.log_dir) or os.makedirs(self.log_dir)

        # Log writer
        log_path = os.path.join(
            self.log_dir,
            self.file_name.lower().replace("tsv", "errors.csv")
        )
        log_file = open(log_path, 'w')
        log_writer = CSVKitWriter(log_file, quoting=csv.QUOTE_ALL)

        # Add the headers
        log_writer.writerow([
            'Line number',
            'Headers len',
            'Fields len',
            'Line value'
        ])

        # Log out the rows
        log_writer.writerows(rows)

        # Shut it down
        log_file.close()
示例#5
0
    def main(self):
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        conn = engine.connect()

        if self.args.query:
            query = self.args.query.strip()
        else:
            query = ""

            for line in self.args.file:
                query += line

        rows = conn.execute(query)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        if not self.args.no_header_row:
            output.writerow(rows._metadata.keys)

        for row in rows:
            output.writerow(row)

        conn.close()
示例#6
0
    def main(self):
        if self.args.names_only:
            print_column_names(self.args.file, self.output_file,
                               **self.reader_kwargs)
            return

        if self.args.file.name != '<stdin>':
            # Use filename as table name
            table_name = os.path.splitext(
                os.path.split(self.args.file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(self.args.file,
                                   name=table_name,
                                   snifflimit=self.args.snifflimit,
                                   **self.reader_kwargs)
        column_ids = parse_column_identifiers(self.args.columns, tab.headers())

        rows = tab.to_rows(serialize_dates=True)
        rows.sort(key=lambda r: [r[c] for c in column_ids],
                  reverse=self.args.reverse)

        rows.insert(0, tab.headers())

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in rows:
            output.writerow(row)
示例#7
0
文件: csvsort.py 项目: bucweat/csvkit
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if self.input_file.name != '<stdin>':
            # Use filename as table name
            table_name = os.path.splitext(
                os.path.split(self.input_file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(self.input_file,
                                   name=table_name,
                                   snifflimit=self.args.snifflimit,
                                   no_header_row=self.args.no_header_row,
                                   infer_types=(not self.args.no_inference),
                                   **self.reader_kwargs)

        column_ids = parse_column_identifiers(self.args.columns, tab.headers(),
                                              self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True)
        sorter = lambda r: [
            r[c] if r[c] is not None else '' for c in column_ids
        ]
        rows.sort(key=sorter, reverse=self.args.reverse)

        rows.insert(0, tab.headers())

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in rows:
            output.writerow(row)
示例#8
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
示例#9
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if self.input_file.name != '<stdin>':
            # Use filename as table name
            table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0]
        else:
            table_name = 'csvsql_table'

        tab = table.Table.from_csv(
            self.input_file,
            name=table_name,
            snifflimit=self.args.snifflimit,
            no_header_row=self.args.no_header_row,
            infer_types=(not self.args.no_inference),
            **self.reader_kwargs
        )
        
        column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based)

        rows = tab.to_rows(serialize_dates=True) 
        sorter = lambda r: [r[c] if r[c] is not None else '' for c in column_ids]
        rows.sort(key=sorter, reverse=self.args.reverse)
        
        rows.insert(0, tab.headers())

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for row in rows:
            output.writerow(row)
示例#10
0
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    A schema CSV must start with a header row with (at least) columns labeled "column","start", and "length". (Other columns will be ignored.) For each subsequent row, therefore, those columns will be used to identify a column name, the starting index of the column (an integer), and the length of the column (also an integer).
    
    Values in the 'start' column are assumed to be zero-based, unless the first value for 'start' is 1, in which case all values are assumed to be one-based.

    If output is specified, rows will be written to that object, otherwise the complete data will be returned.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    try:
        encoding = kwargs['encoding']
    except KeyError:
        encoding = None

    writer = CSVKitWriter(output)

    reader = FixedWidthReader(f, schema, encoding=encoding)
    writer.writerows(reader)

    if not streaming:
        data = output.getvalue()
        return data
    
    # Return empty string when streaming
    return ''
示例#11
0
    def main(self):
        if len(self.args.files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)
                
                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
示例#12
0
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = CSVKitReader(f, **self.reader_kwargs)

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't generate simple column names based on first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
示例#13
0
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write("Line %i: %s\n" % (e.line_number, e.msg))
            else:
                self.output_file.write("No errors.\n")

            if checker.joins:
                self.output_file.write(
                    "%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                    % (checker.rows_joined, checker.joins)
                )
        else:
            base, ext = splitext(self.args.file.name)

            with open("%s_out.csv" % base, "w") as f:
                clean_writer = CSVKitWriter(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = "%s_err.csv" % base

                with open(error_filename, "w") as f:
                    error_writer = CSVKitWriter(f, **self.writer_kwargs)

                    error_header = ["line_number", "msg"]
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write(
                    "%i error%s logged to %s\n" % (error_count, "" if error_count == 1 else "s", error_filename)
                )
            else:
                self.output_file.write("No errors.\n")

            if checker.joins:
                self.output_file.write(
                    "%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                    % (checker.rows_joined, checker.joins)
                )
示例#14
0
文件: xlsx.py 项目: GMADIGITAL/csvkit
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row]) 
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time 
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time() 

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
示例#15
0
文件: cli.py 项目: pombredanne/csvsed
 def main(self):
   reader = CSVKitReader(self.args.file, **self.reader_kwargs)
   cnames = reader.next()
   cids   = parse_column_identifiers(self.args.columns, cnames, self.args.zero_based)
   mods   = {idx: self.args.expr for idx in cids}
   output = CSVKitWriter(self.output_file, **self.writer_kwargs)
   reader = sed.CsvFilter(reader, mods, header=False)
   output.writerow(cnames)
   for row in reader:
     output.writerow(row)
示例#16
0
    def main(self):
        self.input_files = []

        for path in self.args.input_paths:
            self.input_files.append(self._open_input_file(path))

        if len(self.input_files) < 2:
            self.argparser.error('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.input_files] 
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.input_files):
                self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.')
        else:
            groups = None
                
        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.input_files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            #headers = next(rows, [])

            # If we have header rows, use them
            if not self.args.no_header_row:
                headers = next(rows, [])

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)
            # If we don't generate simple column names based on first row
            else:
                row = next(rows, [])

                headers = make_default_headers(len(row))

                if i == 0:
                    if groups:
                        headers.insert(0, group_name)

                    output.writerow(headers)

                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)

            f.close()
示例#17
0
文件: table.py 项目: gepuro/csvkit
    def to_csv(self, output, **kwargs):
        """
        Serializes the table to CSV and writes it to any file-like object.
        """
        rows = self.to_rows(serialize_dates=True)

        # Insert header row
        rows.insert(0, self.headers())

        writer = CSVKitWriter(output, **kwargs)
        writer.writerows(rows)
示例#18
0
    def to_csv(self, output, **kwargs):
        """
        Serializes the table to CSV and writes it to any file-like object.
        """
        rows = self.to_rows(serialize_dates=True)

        # Insert header row
        rows.insert(0, self.headers())

        writer = CSVKitWriter(output, **kwargs)
        writer.writerows(rows)
示例#19
0
文件: xlsx.py 项目: higs4281/csvkit
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    value = value.date()
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ""
示例#20
0
文件: csvpys.py 项目: cypreess/csvkit
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.script:
            self.argparser.error("At least one script -s must be defined.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        script_reader = ScriptCSVReader(rows, scripts=self.args.script, zero_based=self.args.zero_based)
        for i, row in enumerate(script_reader):
            output.writerow(row)
示例#21
0
    def main(self):
        rows = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = rows.next()

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = rows.next()

        column_names = self.args.columns.split(',')

        part_count = 0
        output = CSVKitWriter( open(self.args.file._lazy_args[0]+".part.%d" % part_count, 'w'), **self.writer_kwargs)
        output.writerow(column_names)

        count = 0
        for row in rows:
            if (self.args.lines > 0) and (count == self.args.lines):
                part_count += 1
                count = 0
                # couldn't find a better way to close the file
                del output
                output = CSVKitWriter( open(self.args.file._lazy_args[0]+".part.%d" % part_count, 'w'), **self.writer_kwargs)
                output.writerow(column_names)

            output.writerow(row)
            count += 1
示例#22
0
文件: js.py 项目: yakxxx/csvkit
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    document = f.read()
    js = json.loads(document)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')
        
        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    field_set = set()
    flat = []

    for obj in js:
        flat.append(parse_object(obj)) 

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    o = six.StringIO()
    writer = CSVKitWriter(o, *kwargs)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            if field in i:
                row.append(i[field])
            else:
                row.append(None)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#23
0
 def main(self):
     tabname = os.path.splitext(
         os.path.basename(self.args.file._lazy_args[0]))[0]
     tab = table.Table.from_csv(self.args.file,
                                name=tabname,
                                **self.reader_kwargs)
     stmt = make_create_table_statement(make_table(tab), dialect='sqlite')
     conn = sqlite3.connect(':memory:')
     c = conn.cursor()
     c.execute(stmt)
     for row in tab.to_rows():
         vals = ','.join(['?'] * len(row))
         prepared = "INSERT INTO %s VALUES(%s)" % (tab.name, vals)
         c.execute(prepared, row)
     output = CSVKitWriter(self.output_file, **self.writer_kwargs)
     for row in c.execute(self.args.query):
         output.writerow(row)
示例#24
0
文件: xlsx.py 项目: MinoGames/csvkit
def sheet2csv(sheet, output=None):

    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time() 

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date, datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
示例#25
0
文件: js.py 项目: Trawnee/csvkit
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')
        
        js = js[key]

    if not isinstance(js, list):
        raise TypeError('Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).')

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj)) 

    for obj in js:
        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#26
0
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names, self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        # write header
        output.writerow([column_names[c] for c in column_ids])
        def float_or_else(x):
            try: return float(x)
            except ValueError: return x
        if self.args.filter_expr:
            for row in rows:
                d = {i:float_or_else(j) for i,j in zip(column_names,row)} 
                if eval(self.args.filter_expr,d): 
                    out_row = [row[c] if c < len(row) else None for c in column_ids]
                    output.writerow(out_row)
        elif self.args.not_filter_expr:
            for row in rows:
                d = {i:float_or_else(j) for i,j in zip(column_names,row)} 
                if not eval(self.args.not_filter_expr,d): 
                    out_row = [row[c] if c < len(row) else None for c in column_ids]
                    output.writerow(out_row)
示例#27
0
文件: js.py 项目: plipp/csvkit
def json2csv(f, field=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not field:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[field]

    fields = []
    flats = []

    for obj in js:
        (flat_fields, flat_key_values_dict) = parse_object(obj)
        flats.append(flat_key_values_dict)

        for field in flat_fields:
            if field not in fields:
                fields.append(field)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flats:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#28
0
def download_to_csv(event_id):

    f = StringIO()
    writer = CSVKitWriter(f)

    # List headers
    event = Event.query.get(event_id)
    headers = []
    for question in event.questions:
        headers.append(question.label)
    writer.writerow(headers)
    
    # List entries for each registration
    for registration in event.registrations:
        data = []
        for answer in registration.answers:
            data.append(answer.value)
        writer.writerow(data)

    return Response(f.getvalue(), mimetype='text/csv')
示例#29
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for i, row in enumerate(rows):
            out_row = [row[c] if c < len(row) else None for c in column_ids] 

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue
            
            output.writerow(out_row)
示例#30
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.columns:
            self.argparser.error('You must specify at least one column to search using the -c option.')

        if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None:
            self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.')

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)

        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = set(line.rstrip() for line in self.args.matchfile)
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern

        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
示例#31
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for row in rows:
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
示例#32
0
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based)
        uniq_column_id = parse_column_identifiers(self.args.uniq_column,
                                                  column_names,
                                                  self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow([column_names[c] for c in column_ids])
        d = set()  # cache for used-rows
        # use tuple as keys for cache
        cache_key = lambda row: tuple([row[i] for i in uniq_column_id])
        for row in rows:
            if cache_key(row) in d: continue
            d.update([cache_key(row)])
            out_row = [row[c] if c < len(row) else None for c in column_ids]
            output.writerow(out_row)
示例#33
0
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        drop_white = lambda i: re.sub('\s+$', '', re.sub('^\s+', '', i))
        for row in rows:
            out_row = [
                drop_white(row[c]) if c < len(row) else None
                for c in column_ids
            ]
            output.writerow(out_row)
示例#34
0
文件: ndjs.py 项目: gepuro/csvkit
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    first_line = f.readline()

    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj)) 

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#35
0
    def main(self):
        if len(self.args.files) < 2:
            sys.exit('You must specify at least two files to stack.')

        if self.args.group_by_filenames:
            groups = [os.path.split(f.name)[1] for f in self.args.files]
        elif self.args.groups:
            groups = self.args.groups.split(',')

            if len(groups) != len(self.args.files):
                sys.exit(
                    'The number of grouping values must be equal to the number of CSV files being stacked.'
                )
        else:
            groups = None

        group_name = self.args.group_name if self.args.group_name else 'group'

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        for i, f in enumerate(self.args.files):
            rows = CSVKitReader(f, **self.reader_kwargs)
            headers = rows.next()

            if i == 0:
                if groups:
                    headers.insert(0, group_name)

                output.writerow(headers)

            for row in rows:
                if groups:
                    row.insert(0, groups[i])

                output.writerow(row)
示例#36
0
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)
            for row in checker.checked_rows():
                pass
            if checker.errs:
                for e in checker.errs:
                    self.output_file.write("Line %i: %s\n" % (e.line_number,e.msg))
            else:
                self.output_file.write("No errors.\n")
            if checker.joins:
                self.output_file.write("%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
        else:
            base,ext = splitext(self.args.file.name)
            # should we preserve delimiters and other dialect args from CLI?
            cleaned_file = CSVKitWriter(open("%s_out.csv" % base,"w"), **self.writer_kwargs)

            checker = RowChecker(reader)
            cleaned_file.writerow(checker.column_names)
            for row in checker.checked_rows():
                cleaned_file.writerow(row)
            
            if checker.errs:
                # should we preserve delimiters and other dialect args from CLI?
                err_filename = "%s_err.csv" % base
                err_file = CSVKitWriter(open(err_filename, "w"), **self.writer_kwargs)
                err_header = ['line_number','msg']
                err_header.extend(checker.column_names)
                err_file.writerow(err_header)
                for e in checker.errs:
                    err_file.writerow(self._format_error_row(e))
                    err_count = len(checker.errs)
                self.output_file.write("%i error%s logged to %s\n" % (err_count,"" if err_count == 1 else "s", err_filename))
            else:
                self.output_file.write("No errors.\n")

            if checker.joins:
                self.output_file.write("%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n" % (checker.rows_joined, checker.joins))
示例#37
0
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.no_header_row:
            row = next(rows)

            column_names = make_default_headers(len(row))

            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        all_column_ids = parse_column_identifiers(None,column_names, self.args.zero_based, self.args.not_columns)
        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in all_column_ids])
        d = {} # namespace dict for map_expr
        exec "def f(x): return %s"%(self.args.map_expr) in d

        for row in rows:
            out_row = []
            for c in all_column_ids:
                if c in column_ids:
                    out_row.append(d['f'](row[c]) if c <len(row) else None) 
                else:
                    out_row.append(row[c] if c <len(row) else None) 
            output.writerow(out_row)
示例#38
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        if not self.args.regex and not self.args.pattern and not self.args.matchfile:
            self.argparser.error("One of -r, -m or -f must be specified, unless using the -n option.")

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based)
        
        if self.args.regex:
            pattern = re.compile(self.args.regex)
        elif self.args.matchfile:
            lines = [line.rstrip() for line in self.args.matchfile]
            pattern = lambda x: x in lines
        else:
            pattern = self.args.pattern
            
        patterns = dict((c, pattern) for c in column_ids)

        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        output.writerow(column_names)

        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for i, row in enumerate(filter_reader):
            output.writerow(row)
示例#39
0
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = six.StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True, data_only=True)

    if 'sheet' in kwargs:
        sheet = book.get_sheet_by_name(kwargs['sheet'])
    else:
        sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(
                        1904, 1, 1) and not has_date_elements(c):
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date,
                                   datetime.time):
                value = value.isoformat()

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
示例#40
0
def run(*args):

    avibase_index = 'http://avibase.bsc-eoc.org/checklist.jsp?list=eBird'
    list_name = args[0]
    language = synonyms[args[1]]

    response = urllib2.urlopen(avibase_index)
    html = lxml.html.fromstring(response.read())
    links = html.cssselect('.AVBregions td a')

    names = []

    for link in links:
        if link.text == list_name:
            checklist_url = 'http://avibase.bsc-eoc.org/' + link.attrib['href']
            if language != 'EN':
                checklist_url += '&synlang=%s' % language
            response = urllib2.urlopen(checklist_url)
            html = lxml.html.fromstring(response.read())

            for row in html.cssselect('.AVBlist tr.highlight1'):
                cells = row.cssselect('td')
                scientific_name = cells[1].cssselect('i')[0].text
                if language == 'EN':
                    common_name = cells[0].text
                else:
                    common_name = cells[2].text
                if common_name:
                    common_name = common_name.encode('latin_1').decode('utf-8')
                else:
                    common_name = scientific_name
                names.append((scientific_name, common_name))

    suffix = language.lower()

    with open('species_names_%s.csv' % suffix, 'wb') as fp:
        writer = CSVKitWriter(fp)
        writer.writerow(('scientific_name', 'common_name_%s' % suffix))
        for name in names:
            writer.writerow(name)
示例#41
0
    def log_errors(self, rows):
        """
        Log any errors to a csv file
        """
        # Make sure the log directory exists
        os.path.exists(self.log_dir) or os.makedirs(self.log_dir)

        # Log writer
        log_path = os.path.join(
            self.log_dir,
            self.file_name.lower().replace("tsv", "errors.csv")
        )
        log_file = open(log_path, 'w')
        log_writer = CSVKitWriter(log_file, quoting=csv.QUOTE_ALL)

        # Add the headers
        log_writer.writerow([
            'Line number',
            'Headers len',
            'Fields len',
            'Line value'
        ])

        # Log out the rows
        log_writer.writerows(rows)

        # Shut it down
        log_file.close()
示例#42
0
    def main(self):
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError(
                'You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
            )

        conn = engine.connect()

        if self.args.query:
            query = self.args.query.strip()
        else:
            query = ""

            for line in self.args.file:
                query += line

        rows = conn.execute(query)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        if not self.args.no_header_row:
            output.writerow(rows._metadata.keys)
        for row in rows:
            output.writerow(row)

        conn.close()
 def handle(self, *args, **options):
     self.cursor = connection.cursor()
     sql = """
     SELECT DISTINCT
         o.name,
         o.seat,
         f.filer_id_raw,
         f.xref_filer_id,
         f.name,
         f.party
     FROM %(candidate)s as c
     INNER JOIN %(office)s as o
     ON c.office_id = o.id
     INNER JOIN %(filer)s as f
     ON c.filer_id = f.id
     """ % dict(
         candidate=models.Candidate._meta.db_table,
         office=models.Office._meta.db_table,
         filer=models.Filer._meta.db_table,
     )
     self.cursor.execute(sql)
     writer = CSVKitWriter(open("./candidates.csv", 'wb'))
     writer.writerow([
         'office_name',
         'office_seat',
         'filer_id',
         'xref_filer_id',
         'name',
         'party'
     ])
     writer.writerows(self.cursor.fetchall())
示例#44
0
def geojson2csv(f, key=None, **kwargs):
    """
    Convert a GeoJSON document into CSV format.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if not isinstance(js, dict):
        raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.')

    if 'type' not in js:
        raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.')

    if js['type'] != 'FeatureCollection':
        raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type']) 

    if 'features' not in js:
        raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.')

    features = js['features']
    
    features_parsed = []    # tuples in the format (id, properties, geometry)
    property_fields = []

    for feature in features:
        geoid = feature.get('id', None)

        properties = feature.get('properties') or {}

        for prop in properties.keys():
            if prop not in property_fields:
                property_fields.append(prop)

        geometry = json.dumps(feature['geometry'])

        features_parsed.append((geoid, properties, geometry))

    header = ['id']
    header.extend(property_fields)
    header.append('geojson')

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(header)

    for geoid, properties, geometry in features_parsed:
        row = [geoid]

        for field in property_fields:
            row.append(properties.get(field, None))

        row.append(geometry)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#45
0
 def main(self):
     reader = CSVKitReader(self.args.file, **self.reader_kwargs)
     cnames = reader.next()
     cids = parse_column_identifiers(self.args.columns, cnames,
                                     self.args.zero_based)
     mods = {idx: self.args.expr for idx in cids}
     output = CSVKitWriter(self.output_file, **self.writer_kwargs)
     reader = sed.CsvFilter(reader, mods, header=False)
     output.writerow(cnames)
     for row in reader:
         output.writerow(row)
示例#46
0
文件: xlsx.py 项目: higs4281/csvkit
def xlsx2csv(f, output=None, **kwargs):
    """
    Convert an Excel .xlsx file to csv.

    Note: Unlike other convertor's, this one allows output columns to contain mixed data types.
    Blank headers are also possible.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    writer = CSVKitWriter(output)

    book = load_workbook(f, use_iterators=True)
    sheet = book.get_active_sheet()

    for i, row in enumerate(sheet.iter_rows()):
        if i == 0:
            writer.writerow([c.internal_value for c in row])
            continue

        out_row = []

        for c in row:
            value = c.internal_value

            if value.__class__ is datetime.datetime:
                if value.time() != NULL_TIME:
                    value = normalize_datetime(value)
                else:
                    value = value.date()
            elif value.__class__ is float:
                if value % 1 == 0:
                    value = int(value)

            if value.__class__ in (datetime.datetime, datetime.date,
                                   datetime.time):
                value = value.isoformat()

            if value == "\0":
                continue

            out_row.append(value)

        writer.writerow(out_row)

    if not streaming:
        data = output.getvalue()
        return data

    # Return empty string when streaming
    return ''
示例#47
0
文件: js.py 项目: bucweat/csvkit
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    document = f.read()
    js = json.loads(document)

    if isinstance(js, dict):
        if not key:
            raise TypeError(
                'When converting a JSON document with a top-level dictionary element, a key must be specified.'
            )

        js = js[key]

    if not isinstance(js, list):
        raise TypeError(
            'Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).'
        )

    field_set = set()
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in flat:
        field_set.update(obj.keys())

    fields = sorted(list(field_set))

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            if field in i:
                row.append(i[field])
            else:
                row.append(None)

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#48
0
def json2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    js = json.load(f, object_pairs_hook=OrderedDict)

    if isinstance(js, dict):
        if not key:
            raise TypeError(
                'When converting a JSON document with a top-level dictionary element, a key must be specified.'
            )

        js = js[key]

    if not isinstance(js, list):
        raise TypeError(
            'Only JSON documents with a top-level list element are able to be converted (or a top-level dictionary if specifying a key).'
        )

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

    for obj in js:
        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#49
0
    def main(self):
        rows = CSVKitReader(self.input_file, **self.reader_kwargs)
        if self.args.no_header_row:
            row = next(rows)
            column_names = make_default_headers(len(row))
            # Put the row back on top
            rows = itertools.chain([row], rows)
        else:
            column_names = next(rows)

        column_ids = parse_column_identifiers(None, column_names,
                                              self.args.zero_based)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)
        # write header
        output.writerow([column_names[c] for c in column_ids])

        def float_or_else(x):
            try:
                return float(x)
            except ValueError:
                return x

        if self.args.filter_expr:
            for row in rows:
                d = {i: float_or_else(j) for i, j in zip(column_names, row)}
                if eval(self.args.filter_expr, d):
                    out_row = [
                        row[c] if c < len(row) else None for c in column_ids
                    ]
                    output.writerow(out_row)
        elif self.args.not_filter_expr:
            for row in rows:
                d = {i: float_or_else(j) for i, j in zip(column_names, row)}
                if not eval(self.args.not_filter_expr, d):
                    out_row = [
                        row[c] if c < len(row) else None for c in column_ids
                    ]
                    output.writerow(out_row)
示例#50
0
    def writeCsv(self, filename=None):
        rows = self.getRows()

        if filename == None:
            filename = self.name + '.csv'

        # Output result of last query as CSV
        row_count = 0
        with open(filename, 'wb') as out:
            output = CSVKitWriter(out)
            output.writerow(rows._metadata.keys)
            for row in rows:
                output.writerow(row)
                row_count += 1

        tool.VERBOSE('wrote {} row(s) to csv {}', row_count, filename)
示例#51
0
def ndjson2csv(f, key=None, **kwargs):
    """
    Convert a JSON document into CSV format.

    Supports both JSON and "Newline-delimited JSON".

    The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list.
    """
    first_line = f.readline()

    first_row = json.loads(first_line, object_pairs_hook=OrderedDict)
    js = itertools.chain(
        (first_row, ),
        (json.loads(l, object_pairs_hook=OrderedDict) for l in f))

    fields = []
    flat = []

    for obj in js:
        flat.append(parse_object(obj))

        for key in obj.keys():
            if key not in fields:
                fields.append(key)

    o = six.StringIO()
    writer = CSVKitWriter(o)

    writer.writerow(fields)

    for i in flat:
        row = []

        for field in fields:
            row.append(i.get(field, None))

        writer.writerow(row)

    output = o.getvalue()
    o.close()

    return output
示例#52
0
def download_to_csv(event_id):
    """Downloads the registration data to a CSV"""

    f = StringIO()
    writer = CSVKitWriter(f)

    # List headers
    event = Event.query.get(event_id)
    headers = []
    for question in event.questions:
        headers.append(question.label)
    writer.writerow(headers)

    # List entries for each registration
    for registration in event.registrations:
        data = []
        sorted_answers = sorted(registration.answers,
                                key=lambda a: a.question.ordinal)
        for answer in sorted_answers:
            data.append(answer.value)
        writer.writerow(data)

    return Response(f.getvalue(), mimetype='text/csv')
示例#53
0
    def main(self):
        if self.args.names_only:
            self.print_column_names()
            return

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        column_names = rows.next()

        column_ids = parse_column_identifiers(self.args.columns, column_names,
                                              self.args.zero_based,
                                              self.args.not_columns)
        output = CSVKitWriter(self.output_file, **self.writer_kwargs)

        output.writerow([column_names[c] for c in column_ids])

        for i, row in enumerate(rows):
            out_row = [row[c] if c < len(row) else None for c in column_ids]

            if self.args.delete_empty:
                if ''.join(out_row) == '':
                    continue

            output.writerow(out_row)
    def clean(self, name):
        """
        Cleans the provided source TSV file and writes it out in CSV format
        """
        if self.verbosity > 2:
            self.log(" Cleaning %s" % name)

        # Up the CSV data limit
        csv.field_size_limit(1000000000)

        # Input and output paths
        tsv_path = os.path.join(self.tsv_dir, name)
        csv_path = os.path.join(self.csv_dir,
                                name.lower().replace("tsv", "csv"))

        # Writer
        csv_file = open(csv_path, 'w')
        csv_writer = CSVKitWriter(csv_file, quoting=csv.QUOTE_ALL)

        # Reader
        tsv_file = open(tsv_path, 'rb')

        # Pull and clean the headers
        try:
            headers = tsv_file.readline()
        except StopIteration:
            return
        headers = headers.decode("ascii", "replace")
        headers_csv = CSVKitReader(StringIO(headers), delimiter=str('\t'))
        try:
            headers_list = next(headers_csv)
        except StopIteration:
            return
        headers_count = len(headers_list)
        csv_writer.writerow(headers_list)

        log_rows = []

        # Loop through the rest of the data
        line_number = 1
        for tsv_line in tsv_file:

            # Goofing around with the encoding while we're in there.
            tsv_line = tsv_line.decode("ascii", "replace")
            if six.PY2:
                tsv_line = tsv_line.replace('\ufffd', '?')

            # Nuke any null bytes
            null_bytes = tsv_line.count('\x00')
            if null_bytes:
                tsv_line = tsv_line.replace('\x00', ' ')

            # Nuke ASCII 26 char, the "substitute character"
            # or chr(26) in python
            sub_char = tsv_line.count('\x1a')
            if sub_char:
                tsv_line = tsv_line.replace('\x1a', '')

            # Split on tabs so we can later spit it back out as CSV
            # and remove extra newlines while we are there.
            csv_field_list = tsv_line.replace("\r\n", "").split("\t")

            # Check if our values line up with our headers
            # and if not, see if CSVkit can sort out the problems
            if not len(csv_field_list) == headers_count:
                csv_field_list = next(
                    CSVKitReader(StringIO(tsv_line), delimiter=str('\t')))
                if not len(csv_field_list) == headers_count:
                    if self.verbosity > 2:
                        msg = '  Bad parse of line %s (%s headers, %s values)'
                        self.failure(msg % (line_number, len(headers_list),
                                            len(csv_field_list)))
                    log_rows.append([
                        line_number,
                        len(headers_list),
                        len(csv_field_list), ','.join(csv_field_list)
                    ])
                    continue

            # Write out the row
            csv_writer.writerow(csv_field_list)
            line_number += 1

        # Log errors if there are any
        if log_rows:
            if self.verbosity > 1:
                msg = '  %s errors'
                self.failure(msg % (len(log_rows) - 1))
            self.log_errors(name, log_rows)

        # Shut it down
        tsv_file.close()
        csv_file.close()
示例#55
0
    def main(self):
        reader = CSVKitReader(self.input_file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)

            for row in checker.checked_rows():
                pass

            if checker.errors:
                for e in checker.errors:
                    self.output_file.write('Line %i: %s\n' %
                                           (e.line_number, e.msg))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
        else:
            base, ext = splitext(self.input_file.name)

            with open('%s_out.csv' % base, 'w') as f:
                clean_writer = CSVKitWriter(f, **self.writer_kwargs)

                checker = RowChecker(reader)
                clean_writer.writerow(checker.column_names)

                for row in checker.checked_rows():
                    clean_writer.writerow(row)

            if checker.errors:
                error_filename = '%s_err.csv' % base

                with open(error_filename, 'w') as f:
                    error_writer = CSVKitWriter(f, **self.writer_kwargs)

                    error_header = ['line_number', 'msg']
                    error_header.extend(checker.column_names)
                    error_writer.writerow(error_header)

                    error_count = len(checker.errors)

                    for e in checker.errors:
                        error_writer.writerow(self._format_error_row(e))

                self.output_file.write(
                    '%i error%s logged to %s\n' %
                    (error_count, '' if error_count == 1 else 's',
                     error_filename))
            else:
                self.output_file.write('No errors.\n')

            if checker.joins:
                self.output_file.write(
                    '%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n'
                    % (checker.rows_joined, checker.joins))
示例#56
0
    def main(self):
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)
            for row in checker.checked_rows():
                pass
            if checker.errs:
                for e in checker.errs:
                    self.output_file.write("Line %i: %s\n" %
                                           (e.line_number, e.msg))
            else:
                self.output_file.write("No errors.\n")
            if checker.joins:
                self.output_file.write(
                    "%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                    % (checker.rows_joined, checker.joins))
        else:
            base, ext = splitext(self.args.file.name)
            # should we preserve delimiters and other dialect args from CLI?
            cleaned_file = CSVKitWriter(open("%s_out.csv" % base, "w"),
                                        **self.writer_kwargs)

            checker = RowChecker(reader)
            cleaned_file.writerow(checker.column_names)
            for row in checker.checked_rows():
                cleaned_file.writerow(row)

            if checker.errs:
                # should we preserve delimiters and other dialect args from CLI?
                err_filename = "%s_err.csv" % base
                err_file = CSVKitWriter(open(err_filename, "w"),
                                        **self.writer_kwargs)
                err_header = ['line_number', 'msg']
                err_header.extend(checker.column_names)
                err_file.writerow(err_header)
                for e in checker.errs:
                    err_file.writerow(self._format_error_row(e))
                    err_count = len(checker.errs)
                self.output_file.write(
                    "%i error%s logged to %s\n" %
                    (err_count, "" if err_count == 1 else "s", err_filename))
            else:
                self.output_file.write("No errors.\n")

            if checker.joins:
                self.output_file.write(
                    "%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n"
                    % (checker.rows_joined, checker.joins))