示例#1
0
def main(fname):
    """Run two passes over a CSV file and print the resulting typed fields.

    Pass 1 creates one ``Field`` per header (the first column is skipped)
    and adds every value of the matching column to it.  Pass 2 re-reads the
    file, converts each ``Field`` with ``to_typed()`` and adds the same
    values to the typed fields; a value that raises ``ValueError`` or
    ``TypeError`` is reported on stdout and skipped.  Finally the first
    header and every typed field are printed.

    Headers and values are decoded with the encoding returned by
    ``smart_csv.csv_open`` and re-encoded as UTF-8 before being handed to
    the fields.  A running row count is printed every 10000 rows.

    Args:
        fname: path of the CSV file to read.
    """
    # Pass 1: collect the raw values of every column except the first.
    with open(fname, "rb") as f:
        reader, enc, _ = smart_csv.csv_open(f)
        headers = next(reader)
        fields = [Field(h.decode(enc).encode("utf8")) for h in headers[1:]]

        for count, row in enumerate(reader, 1):
            for val, field in zip(row[1:], fields):
                field.add(val.decode(enc).encode("utf8"))

            if count % 10000 == 0:
                # Single-argument print() emits the same text whether print
                # is a statement or a function.
                print(count)

    skips = 0  # number of rejected values (tallied, not reported)
    # Pass 2: opened in binary mode exactly like pass 1 so that both passes
    # see the same bytes regardless of platform newline translation.
    with open(fname, "rb") as f:
        reader, enc, _ = smart_csv.csv_open(f)
        headers = next(reader)
        tfields = [field.to_typed() for field in fields]

        for count, row in enumerate(reader, 1):
            for val, tfield in zip(row[1:], tfields):
                try:
                    tfield.add(val.decode(enc).encode("utf8"))
                except (ValueError, TypeError):
                    skips += 1
                    print("%s %s %s" % ("Skipping", val, tfield.__class__.__name__))
            if count % 10000 == 0:
                print(count)
        print(headers[0].decode(enc))
        for tfield in tfields:
            print(tfield)
示例#2
0
def add_csv_rows(incsv_name_base, incsv_name_new, outcsv_name, new_loc=-1, id_col=-1):
    """Splice the columns of one CSV into another, pairing rows by position.

    For every data row of ``incsv_name_base`` the next row of
    ``incsv_name_new`` is read and inserted at column index ``new_loc``;
    the combined rows (preceded by the combined header) are written to
    ``outcsv_name``.

    Args:
        incsv_name_base: path of the CSV whose rows drive the output.
        incsv_name_new: path of the CSV supplying the inserted columns.
        outcsv_name: path of the CSV to write.
        new_loc: column index at which the new columns are inserted;
            ``-1`` (the default) means after the last base header column.
            Converted with ``int()``.
        id_col: index of a column in the new file to drop before
            inserting; ``-1`` (the default) drops nothing.  Converted with
            ``int()``.

    Note:
        Rows are matched purely by order.  ``next(cin2)`` is called once
        per base row, so a new file with fewer data rows than the base
        file presumably ends the run with ``StopIteration`` -- confirm
        against ``smart_csv``.
    """
    new_loc = int(new_loc)
    id_col = int(id_col)
    drop_id = id_col != -1
    with open(incsv_name_base) as inf1:
        cin1, _, _ = smart_csv.csv_open(inf1)
        headers1 = next(cin1)
        if new_loc == -1:
            new_loc = len(headers1)
        with open(incsv_name_new) as inf2:
            cin2, _, _ = smart_csv.csv_open(inf2)
            headers2 = next(cin2)
            if drop_id:
                print('%s %s' % ('id_col is', headers2[id_col]))
                del headers2[id_col]
            else:
                print('no id_col')

            with open(outcsv_name, 'wt') as outf:
                cout = csv.writer(outf)
                cout.writerow(headers1[:new_loc] + headers2 + headers1[new_loc:])
                for row1 in cin1:
                    row2 = next(cin2)
                    # Only drop the id column when one was requested: with
                    # id_col == -1 an unconditional `del row2[-1]` would
                    # strip the last value from every row while its header
                    # stays in place, misaligning data and header.
                    if drop_id:
                        del row2[id_col]
                    cout.writerow(row1[:new_loc] + row2 + row1[new_loc:])
示例#3
0
def add_rows(in1, in2, outname):
    """Concatenate the data rows of two CSV files that share a header row.

    The header is written once, followed by every remaining row of ``in1``
    and then every remaining row of ``in2``.

    Args:
        in1: path of the first input CSV.
        in2: path of the second input CSV.
        outname: path of the CSV to write.

    Raises:
        ValueError: if the two header rows differ; the message holds both
            headers.  The output file is not opened in that case.
    """
    with open(in1) as first_file:
        with open(in2) as second_file:
            first_reader = smart_csv.csv_open(first_file)[0]
            second_reader = smart_csv.csv_open(second_file)[0]
            first_header = first_reader.next()
            second_header = second_reader.next()
            if first_header != second_header:
                raise ValueError(' '.join([str(first_header), str(second_header)]))
            with open(outname, 'wt') as out_file:
                writer = csv.writer(out_file)
                writer.writerow(first_header)
                # Drain the first reader completely before the second.
                for reader in (first_reader, second_reader):
                    for record in reader:
                        writer.writerow(record)
示例#4
0
def read_dict(fname, key, vals):
    """Build a lookup table from a CSV file.

    Args:
        fname: path of the CSV file to read.
        key: header name of the column whose values become dict keys.
        vals: header names of the columns whose values are collected, in
            this order, into the list stored under each key.

    Returns:
        A dict mapping each row's ``key`` value to the list of its ``vals``
        values.  When a key value occurs in several rows the last row wins.

    Raises:
        ValueError: if ``header.index`` cannot find ``key`` or one of
            ``vals`` in the header row.
    """
    with open(fname) as source:
        reader = smart_csv.csv_open(source)[0]
        header = reader.next()
        key_pos = header.index(key)
        val_positions = [header.index(name) for name in vals]
        return {
            record[key_pos]: [record[pos] for pos in val_positions]
            for record in reader
        }
示例#5
0
def filter_columns(in_name, out_name, keep, killcols):
    """Copy a CSV file, passing every row through a column filter.

    The header names in ``killcols`` are resolved to a set of column
    indexes; the header row and each data row are then transformed by
    ``keepf`` (when ``keep`` is truthy) or ``killf`` (otherwise), called as
    ``f(row, indexes)``, and the result is written out.

    Args:
        in_name: path of the CSV file to read.
        out_name: path of the CSV file to write.
        keep: selects ``keepf`` when truthy, ``killf`` when falsy.
            NOTE(review): presumably keep-only vs. drop the listed
            columns -- confirm against the definitions of those helpers.
        killcols: header names identifying the columns handed to the
            filter.

    Raises:
        ValueError: if ``header.index`` cannot find a name from
            ``killcols`` in the header row.
    """
    colfilt = keepf if keep else killf
    with open(in_name) as source:
        reader = smart_csv.csv_open(source)[0]
        header = reader.next()
        selected = set(header.index(name) for name in killcols)
        with open(out_name, 'wt') as target:
            writer = csv.writer(target)
            writer.writerow(colfilt(header, selected))
            for record in reader:
                writer.writerow(colfilt(record, selected))
示例#6
0
def merge_csv(orig_csv, target_idx, d, key, vals, out_csv, keep):
    """Replace a CSV's key column with values looked up in a dict.

    The column named ``key`` is removed from every row.  In the header row
    the names in ``vals`` are inserted at ``target_idx``; in each data row
    the list ``d[<key value>]`` is inserted instead, or one empty string
    per entry of ``vals`` when the key value is not in ``d``.

    Args:
        orig_csv: path of the CSV file to read.
        target_idx: index (applied after the key column is removed) at
            which the extra columns are inserted.
        d: dict mapping key-column values to lists of replacement values.
        key: header name of the column used for the lookup.
        vals: list of header names for the inserted columns.
        out_csv: path of the CSV file to write.
        keep: accepted but not used by this function.

    Raises:
        ValueError: if ``header.index`` cannot find ``key`` in the header.
    """
    with open(orig_csv) as source:
        reader = smart_csv.csv_open(source)[0]
        with open(out_csv, 'wt') as target:
            writer = csv.writer(target)
            header = reader.next()
            key_pos = header.index(key)
            # Filler used for rows whose key value has no entry in `d`.
            blanks = [''] * len(vals)

            def splice(record, extra):
                # Drops the key column in place, then inserts `extra`.
                record.pop(key_pos)
                return record[:target_idx] + extra + record[target_idx:]

            writer.writerow(splice(header, vals))

            for record in reader:
                # Look the key up before splice() removes it from the row.
                extra = d.get(record[key_pos], blanks)
                writer.writerow(splice(record, extra))
示例#7
0
def kill_outliers(incsv_name, outcsv_name, outliers):
    """Copy a CSV file, leaving out the rows flagged by ``has_outliers``.

    The header row is copied unchanged.  Each data row for which
    ``has_outliers(outliers, row)`` is truthy is printed to stdout and
    omitted from the output; every other row is written as is.  A
    ``kept``/``skipped`` summary line is printed at the end.

    Args:
        incsv_name: path of the CSV file to read.
        outcsv_name: path of the CSV file to write.
        outliers: passed through unchanged as the first argument of
            ``has_outliers``.
    """
    n_kept = 0
    n_skipped = 0
    with open(incsv_name) as source:
        reader, _, _ = smart_csv.csv_open(source)
        with open(outcsv_name, 'wt') as target:
            writer = csv.writer(target)
            writer.writerow(reader.next())

            for record in reader:
                if not has_outliers(outliers, record):
                    writer.writerow(record)
                    n_kept += 1
                    continue
                n_skipped += 1
                print(record)
    # Single-argument print() produces the same line as the original
    # space-separated print statement.
    print('%s %s %s %s' % ('kept', n_kept, 'skipped', n_skipped))
示例#8
0
def filter_columns(colname, incsvname, outcsvname):
    """Copy only the selected columns of a CSV file to a new file.

    ``colname`` is a text file read line by line: a line whose first
    comma-separated field, stripped of whitespace, starts with ``+``
    selects the column whose index equals that line's zero-based position
    in the file.  The selected columns of the header and of every data row
    of ``incsvname`` are written to ``outcsvname`` in selection order.

    Args:
        colname: path of the column-selection file.
        incsvname: path of the CSV file to read.
        outcsvname: path of the CSV file to write.
    """
    with open(colname, 'rb') as columnfile:
        for i, row in enumerate(columnfile):
            if row.split(',')[0].strip().startswith('+'):
                # NOTE(review): `columns` is not defined in this function;
                # it is presumably a module-level list, so selections
                # accumulate across calls -- confirm this is intended.
                columns.append(i)

    with open(incsvname, 'rb') as incsvfile:
        with open(outcsvname, 'wb') as outcsvfile:
            # Read through the handle managed by the `with` statement
            # instead of a second, never-closed open() of the same path.
            csvreader, _, _ = smart_csv.csv_open(incsvfile)
            csvwriter = csv.writer(outcsvfile)
            headers = next(csvreader)
            csvwriter.writerow([headers[c] for c in columns])
            for row in csvreader:
                csvwriter.writerow([row[c] for c in columns])