def filterBadValues(in_filename, out_filename, keys):
    """Copy a csv file, dropping rows with a negative value in a key column.

    The header line is copied unchanged.  A data row is dropped when the
    value in any of the columns named by `keys`, converted with float(),
    is below zero.  Row counts for both files are printed at the end.

    in_filename: path of the csv file to read.
    out_filename: path of the csv file to write.
    keys: header names of the columns to check.

    Raises ValueError if a name in `keys` is not in the header, or if a
    checked value cannot be converted to float.
    """
    print('filterBadValues %s %s %s' % (in_filename, out_filename, keys))

    num_rows = 0
    num_written = 0
    # Nested `with` blocks close both files even when a row fails to parse.
    with open(in_filename, 'rt') as fin:
        with open(out_filename, 'wt') as fout:
            header = csv.readCsvLine(fin)
            csv.writeCsvRow(fout, header)
            print(header)

            data = csv.readCsvGen(fin)
            key_columns = [header.index(k) for k in keys]

            for row in data:
                num_rows += 1
                # Convert every checked value before testing, so that a
                # non-numeric entry always surfaces as a ValueError.
                values = [float(row[i]) for i in key_columns]
                if not any(v < 0 for v in values):
                    csv.writeCsvRow(fout, row)
                    # Count written rows directly: a row with several
                    # negative values must only be counted as dropped once.
                    num_written += 1

    print('%s %s rows' % (in_filename, num_rows))
    print('%s %s rows' % (out_filename, num_written))
def _checkCsvRowLengths(filename):
    """Assert that every data row of a csv file is as long as its header."""
    with open(filename, 'rt') as fin:
        header = csv.readCsvLine(fin)
        for irow, row in enumerate(csv.readCsvGen(fin)):
            if len(row) != len(header):
                # Show the offending row before the assertion fires.
                print('%s %s %s %s %s'
                      % (irow, len(row), row, len(header), header))
            assert len(row) == len(header)


def sampleCsv(in_filename, out_filename, ratio):
    """Sample a csv file.

    Copies the header, then writes a data row whenever the number of rows
    written so far is below `ratio` times the row's 0-based index.  The
    first data row (index 0) is therefore never written.  After writing,
    the output file is re-read to check that every row has as many fields
    as the header.

    in_filename: path of the csv file to read.
    out_filename: path of the csv file to write.
    ratio: target fraction of data rows to keep.

    Raises AssertionError if a row of the output file does not match the
    header length.
    """
    print('sampleCsv %s %s %s' % (in_filename, out_filename, ratio))

    num_rows = 0
    num_sampled = 0
    # Nested `with` blocks close both files even if reading or writing fails.
    with open(in_filename, 'rt') as fin:
        with open(out_filename, 'wt') as fout:
            header = csv.readCsvLine(fin)
            print('header: %s' % (header,))
            csv.writeCsvRow(fout, header)

            for irow, row in enumerate(csv.readCsvGen(fin)):
                if irow % 100000 == 0:
                    # Python 2 print statement: the trailing comma keeps all
                    # progress tuples on one line.
                    print (irow, num_sampled),
                if num_sampled < ratio * irow:
                    csv.writeCsvRow(fout, row)
                    num_sampled += 1
                # Track a real count: irow is the last index (one short),
                # and is never bound when the input has no data rows.
                num_rows = irow + 1
            # Python 2 bare print: ends the progress line.
            print

    print('%s %s rows' % (in_filename, num_rows))
    print('%s %s rows' % (out_filename, num_sampled))

    _checkCsvRowLengths(out_filename)