def main(fname):
    """Scan a CSV file twice and print a summary of its typed columns.

    Pass 1 builds one ``Field`` per header (skipping the first column) and
    feeds every value of the matching column into it. Pass 2 converts each
    ``Field`` with ``to_typed()`` and feeds the same values in again; any
    value rejected with ``ValueError`` or ``TypeError`` is reported on
    stdout and skipped. Finally the first header and every typed field are
    printed.

    Headers and values are decoded with the encoding reported by
    ``smart_csv.csv_open`` and re-encoded as UTF-8 before being handed to
    the fields. A progress counter is printed every 10000 rows in each pass.

    Args:
        fname: Path of the CSV file to analyse.
    """

    def to_utf8(raw, encoding):
        # Normalise a raw cell from the source encoding to UTF-8 bytes.
        return raw.decode(encoding).encode("utf8")

    # Pass 1: collect the raw values of every column but the first.
    with open(fname, "rb") as f:
        c, enc, _ = smart_csv.csv_open(f)
        headers = next(c)
        fields = [Field(to_utf8(h, enc)) for h in headers[1:]]
        for count, row in enumerate(c, 1):
            for val, field in zip(row[1:], fields):
                field.add(to_utf8(val, enc))
            if count % 10000 == 0:
                print(count)

    # Pass 2: replay the values into the typed fields.  The file is opened
    # in binary mode here as well, so both passes hand smart_csv the same
    # bytes (the second pass previously used text mode).
    with open(fname, "rb") as f:
        c, enc, _ = smart_csv.csv_open(f)
        headers = next(c)
        tfields = [field.to_typed() for field in fields]
        for count, row in enumerate(c, 1):
            for val, tfield in zip(row[1:], tfields):
                try:
                    tfield.add(to_utf8(val, enc))
                except (ValueError, TypeError):
                    # Best effort: report the rejected value and carry on.
                    print(" ".join(
                        ["Skipping", val, tfield.__class__.__name__]))
            if count % 10000 == 0:
                print(count)

    print(headers[0].decode(enc))
    for tfield in tfields:
        print(tfield)
def add_csv_rows(incsv_name_base, incsv_name_new, outcsv_name, new_loc=-1, id_col=-1):
    """Splice the columns of one CSV file into another, row by row.

    Rows are paired by position: the n-th data row of ``incsv_name_new`` is
    inserted into the n-th data row of ``incsv_name_base`` and the result is
    written to ``outcsv_name``. Iteration is driven by the base file, so
    surplus rows in the new file are ignored.

    Args:
        incsv_name_base: Path of the CSV file receiving the new columns.
        incsv_name_new: Path of the CSV file supplying the new columns.
        outcsv_name: Path of the CSV file to write.
        new_loc: Column index in the base file at which the new columns are
            inserted; ``-1`` (default) places them after the last base
            column. Converted with ``int()``.
        id_col: Index of a column of the new file to drop before merging;
            ``-1`` (default) means no column is dropped. Converted with
            ``int()``.

    Raises:
        StopIteration: If the new file has fewer data rows than the base.
    """
    new_loc = int(new_loc)
    id_col = int(id_col)
    drop_id = id_col != -1

    with open(incsv_name_base) as inf1:
        cin1, _, _ = smart_csv.csv_open(inf1)
        headers1 = next(cin1)
        if new_loc == -1:
            new_loc = len(headers1)

        with open(incsv_name_new) as inf2:
            cin2, _, _ = smart_csv.csv_open(inf2)
            headers2 = next(cin2)
            if drop_id:
                print(' '.join(['id_col is', headers2[id_col]]))
                del headers2[id_col]
            else:
                print('no id_col')

            with open(outcsv_name, 'wt') as outf:
                cout = csv.writer(outf)
                cout.writerow(headers1[:new_loc] + headers2 + headers1[new_loc:])
                for row1 in cin1:
                    row2 = next(cin2)
                    # Only drop the id column when one was requested.  With
                    # the -1 sentinel an unconditional delete would discard
                    # the last column of every new row and misalign the
                    # data with the header.
                    if drop_id:
                        del row2[id_col]
                    cout.writerow(row1[:new_loc] + row2 + row1[new_loc:])
def add_rows(in1, in2, outname):
    """Concatenate the data rows of two CSV files that share one header.

    The header is written once, followed by every data row of ``in1`` and
    then every data row of ``in2``. The output file is only opened after
    the headers have been compared.

    Args:
        in1: Path of the first input CSV file.
        in2: Path of the second input CSV file.
        outname: Path of the CSV file to write.

    Raises:
        ValueError: If the two header rows differ; the message contains
            both headers.
    """
    with open(in1) as first_file, open(in2) as second_file:
        first_reader = smart_csv.csv_open(first_file)[0]
        second_reader = smart_csv.csv_open(second_file)[0]

        first_header = next(first_reader)
        second_header = next(second_reader)
        if not first_header == second_header:
            raise ValueError(' '.join((str(first_header), str(second_header))))

        with open(outname, 'wt') as out_file:
            writer = csv.writer(out_file)
            writer.writerow(first_header)
            writer.writerows(first_reader)
            writer.writerows(second_reader)
def read_dict(fname, key, vals):
    """Load selected columns of a CSV file into a dict keyed by one column.

    Args:
        fname: Path of the CSV file to read.
        key: Header name of the column whose values become the dict keys.
        vals: Header names of the columns collected for each key, in the
            order they should appear in the value list.

    Returns:
        A dict mapping each key-column value to the list of that row's
        values for ``vals``. When a key occurs in several rows, the last
        row wins.

    Raises:
        ValueError: If ``key`` or any name in ``vals`` is not in the header.
    """
    with open(fname) as source:
        reader = smart_csv.csv_open(source)[0]
        header = next(reader)
        key_pos = header.index(key)
        value_positions = [header.index(name) for name in vals]
        return {
            record[key_pos]: [record[pos] for pos in value_positions]
            for record in reader
        }
def filter_columns(in_name, out_name, keep, killcols):
    """Copy a CSV file, passing every row through a column filter.

    The header names in ``killcols`` are resolved to column indices, and
    the header row plus every data row is passed, together with that index
    set, to ``keepf`` (when ``keep`` is truthy) or ``killf`` (otherwise).
    Presumably ``keepf`` retains the indexed columns and ``killf`` removes
    them -- confirm against their definitions.

    NOTE(review): a later definition named ``filter_columns`` with a
    different signature is visible in this file and would shadow this one
    at import time -- confirm which is intended.

    Args:
        in_name: Path of the CSV file to read.
        out_name: Path of the CSV file to write.
        keep: Selects ``keepf`` when truthy, ``killf`` otherwise.
        killcols: Header names identifying the columns handed to the filter.

    Raises:
        ValueError: If a name in ``killcols`` is not in the header.
    """
    row_filter = keepf if keep else killf

    with open(in_name) as source:
        reader = smart_csv.csv_open(source)[0]
        header_row = next(reader)
        selected = {header_row.index(name) for name in killcols}

        with open(out_name, 'wt') as sink:
            writer = csv.writer(sink)
            writer.writerow(row_filter(header_row, selected))
            for record in reader:
                writer.writerow(row_filter(record, selected))
def merge_csv(orig_csv, target_idx, d, key, vals, out_csv, keep):
    """Replace a CSV file's key column with columns looked up in a dict.

    For each data row the value in column ``key`` is looked up in ``d``.
    The key column is removed and the looked-up list is inserted at
    ``target_idx``, an index into the row *after* the key column has been
    removed. Rows whose key is missing from ``d`` receive one empty string
    per entry of ``vals``. The header row gets ``vals`` itself.

    Args:
        orig_csv: Path of the CSV file to read.
        target_idx: Insertion index for the new columns.
        d: Mapping from key-column value to a list of replacement values.
        key: Header name of the lookup column.
        vals: Header names (a list) for the inserted columns.
        out_csv: Path of the CSV file to write.
        keep: Accepted but not used by this function.

    Raises:
        ValueError: If ``key`` is not in the header.
    """
    with open(orig_csv) as source:
        reader = smart_csv.csv_open(source)[0]
        with open(out_csv, 'wt') as sink:
            writer = csv.writer(sink)
            header_row = next(reader)
            key_pos = header_row.index(key)

            def splice(record, extra):
                # Drop the key column, then insert the extra cells.
                del record[key_pos]
                head, tail = record[:target_idx], record[target_idx:]
                return head + extra + tail

            writer.writerow(splice(header_row, vals))
            for record in reader:
                # The lookup must read the key before splice() deletes it.
                looked_up = d.get(record[key_pos], [''] * len(vals))
                writer.writerow(splice(record, looked_up))
def kill_outliers(incsv_name, outcsv_name, outliers):
    """Copy a CSV file, leaving out rows flagged by ``has_outliers``.

    The header row is always copied. Each data row for which
    ``has_outliers(outliers, row)`` is truthy is printed to stdout and
    omitted from the output; all other rows are written unchanged. A final
    line reporting the kept and skipped counts is printed.

    Args:
        incsv_name: Path of the CSV file to read.
        outcsv_name: Path of the CSV file to write.
        outliers: Opaque criteria forwarded to ``has_outliers``.
    """
    kept = skipped = 0

    with open(incsv_name) as source:
        reader, _, _ = smart_csv.csv_open(source)
        with open(outcsv_name, 'wt') as sink:
            writer = csv.writer(sink)
            writer.writerow(next(reader))
            for record in reader:
                if not has_outliers(outliers, record):
                    writer.writerow(record)
                    kept += 1
                    continue
                skipped += 1
                print(record)

    print(' '.join(['kept', str(kept), 'skipped', str(skipped)]))
def filter_columns(colname, incsvname, outcsvname):
    """Copy only the columns marked with ``+`` in a column-spec file.

    Each line of ``colname`` corresponds, by zero-based line number, to a
    column index of the input CSV. A line selects its column when the text
    before its first comma, stripped of surrounding whitespace, starts with
    ``+``. The header row and every data row of ``incsvname`` are then
    written to ``outcsvname`` restricted to the selected columns.

    NOTE(review): ``columns`` is not defined inside this function; it is
    resolved from an outer scope that is not visible here, so selections
    accumulate across calls unless it is reset elsewhere -- confirm.

    Args:
        colname: Path of the column-spec file.
        incsvname: Path of the CSV file to read.
        outcsvname: Path of the CSV file to write.

    Raises:
        IndexError: If a selected index is beyond the end of a row.
    """
    with open(colname, 'rb') as columnfile:
        for index, line in enumerate(columnfile):
            if line.split(',')[0].strip()[:1] == '+':
                columns.append(index)

    with open(incsvname, 'rb') as incsvfile:
        with open(outcsvname, 'wb') as outcsvfile:
            # Read through the handle managed by the ``with`` block.  The
            # previous code opened the input a second time and never closed
            # that extra handle.
            csvreader, _, _ = smart_csv.csv_open(incsvfile)
            csvwriter = csv.writer(outcsvfile)
            headers = next(csvreader)
            csvwriter.writerow([headers[c] for c in columns])
            for row in csvreader:
                csvwriter.writerow([row[c] for c in columns])