def run(infile, options, report_step=10000):
    """Load tabular data from ``infile`` into a database table.

    Reads rows via a csv DictReader, guesses column types from the first
    ``options.guess_size`` rows, creates the target table and inserts all
    data, then builds the requested indices.

    :param infile: open file handle with the input table.
    :param options: parsed command-line options; must provide the backend,
        table, dialect, retry and column-handling attributes used below.
    :param report_step: emit a progress message every this many rows.

    Side effects: creates/overwrites the table ``options.tablename`` in the
    configured database and commits at the end.
    """
    options.tablename = quoteTableName(
        options.tablename, backend=options.backend)

    # "--map" arguments arrive as "column:type" strings; turn them into a
    # {column: type} dict for convertDictionary/createTable.
    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    # NOTE(review): this function mixes options.database_backend (postgres,
    # mysql) and options.backend (quoteTableName above, sqlite below) —
    # confirm both attributes exist and agree on the option object.
    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        # NOTE(review): everything below the raise is unreachable dead code
        # kept from a previous version of the postgres path.
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        # best-effort: make the database group-writable; a failure is only
        # warned about, not fatal.
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        # collect the tables already present so createTable can detect
        # collisions / append targets.
        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    # explicit header given on the command line overrides the file's header
    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    # when replacing the header, discard the file's own first row
    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    # buffer the first guess_size rows in memory; they are both used for
    # type guessing and re-emitted first by row_iter below.
    rows = []
    for row in reader:
        if None in row:
            raise ValueError(
                "undefined columns in input file at row: %s" % row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
        # Yield all quoted rows: first the buffered guessing rows, then the
        # remainder of the input stream.
        for row in rows:
            yield quoteRow(row, take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        # dump everything to a temp file and shell out to the sqlite3 CLI
        # ".import" command (quick_import_statement set in the sqlite branch
        # above — other backends raise before reaching here).
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):
            ninput += 1
            os.write(
                outfile,
                quick_import_separator.join([str(d[x]) for x in take]) + "\n")
            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:
            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)
            if retcode != 0:
                E.warn("import error using statement: %s" % statement)
                if not options.retry:
                    raise ValueError(
                        "import error using statement: %s" % statement)
                time.sleep(5)
                continue
            break

        os.remove(filename)

        # there is no way to insert NULL values into sqlite. The only
        # solution is to update all colums.
        for column in take:
            executewait(
                dbhandle,
                "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                (options.tablename, column, column),
                error,
                options.retry)

    elif options.insert_many:
        # bulk insert via executemany with "?" placeholders
        data = []
        for d in row_iter(rows, reader):
            ninput += 1
            data.append([d[x] for x in take])
            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (options.tablename,
                                                    ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        # retry loop: only a "database locked" error is retried (every 5s),
        # anything else is re-raised.
        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename,
                                                         ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):
            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle, statement, error,
                             retry=options.retry,
                             args=d)
            cc.close()
            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:
        nindex += 1
        # index creation failures (e.g. duplicate index) are reported but
        # do not abort the import
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    # final sanity count of rows actually present in the table
    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated table from stdin and writes the selected (or,
    with ``--remove``, the non-selected) columns to stdout as CSV rows.
    Column names come from the positional arguments or from
    ``--filename-fields``; names of the form ``%pattern%`` are treated as
    regular-expression searches against the header.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r", "--remove", dest="remove", action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.add_option(
        "-l", "--large", dest="large", action="store_true",
        help="large columns. Do not use native python CSV module [default=%default].")

    parser.add_option("-f", "--filename-fields", dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    input_fields = args

    # field names may alternatively come from the first column of a
    # tab-separated file; lines starting with '#' are skipped.
    if options.filename_fields:
        input_fields = [
            x[:-1].split("\t")[0] for x in [
                x for x in
                IOTools.open_file(options.filename_fields, "r").readlines()
                if x[0] != "#"
            ]
        ]

    # UniqueBuffer suppresses duplicate output rows (project helper)
    if options.unique:
        outfile = UniqueBuffer(options.stdout)
    else:
        outfile = options.stdout

    # find the first non-comment line; an empty stream exits cleanly with
    # status 0 before any output.
    while 1:
        line = options.stdin.readline()
        if not line:
            E.stop()
            sys.exit(0)
        if line[0] == "#":
            continue
        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
    for f in input_fields:
        # do pattern search
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)

    # with --remove, invert the selection but keep the original column order
    if options.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(options.stdin),
                                     fieldnames=old_fields,
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(options.stdin),
                                fieldnames=old_fields,
                                dialect=options.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    # NOTE(review): the header goes to sys.stdout via print, not through
    # `writer`/`outfile` — with --unique it bypasses UniqueBuffer; confirm
    # this is intended.
    print("\t".join(fields))

    # NOTE(review): first_row is never read below — apparent leftover.
    first_row = True
    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        ninput += 1

        try:
            row = six.next(reader)
        except _csv.Error as msg:
            # malformed line: report and keep going
            options.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        writer.writerow(row)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads CSV from stdin, evaluates the positional arguments as a Python
    expression over each row dict ``r``, and writes rows for which the
    expression is true to stdout.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r", "--remove", dest="remove", action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.add_option(
        "-l", "--large", dest="large", action="store_true",
        help="large columns. Do not use native python CSV module [default=%default].")

    parser.add_option("-f", "--filename-fields", dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    # NOTE(review): options.large is read below but has no default set here
    # (unlike remove/unique) — confirm E.start/add_csv_options supplies it.
    parser.set_defaults(
        remove=False,
        unique=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    # the filter expression is everything left on the command line
    statement = " ".join(args)

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(sys.stdin),
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                dialect=options.csv_dialect)

    # SECURITY: executes an arbitrary command-line-supplied Python
    # expression; acceptable only because the caller already has shell
    # access, but never feed this untrusted input.
    exec("f = lambda r: %s" % statement, globals())
    counter = E.Counter()
    writer = csv.DictWriter(options.stdout,
                            reader.fieldnames,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator)

    # emit the header row (fieldname -> fieldname)
    writer.writerow(dict((fn, fn) for fn in reader.fieldnames))

    while 1:
        counter.input += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            # malformed line: report and keep going
            options.stderr.write("# error while parsing: %s\n" % (msg))
            counter.errors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        # keep rows for which the user expression is truthy
        if f(row):
            writer.writerow(row)
            counter.output += 1
        else:
            counter.filtered += 1

    E.info("%s" % counter)
    E.stop()