예제 #1
0
def read_csv_file(filename, required_fieldnames=None, varlen=False):
    """
    Read a CSV file with a header row and return its data rows as dicts.

    The first row supplies the field names. Every field name and every data
    value is passed through ``ids.clean_id``, and trailing blank entries are
    dropped from the header and from each data row. An empty file is treated
    as having no field names and no data rows.

    Args:
        filename: Path of the CSV file to read.
        required_fieldnames: Optional iterable of field names that must be
            present in the header (compared after ``ids.clean_id``). Field
            names in the file beyond these only trigger a warning; their
            columns still appear in the returned dicts.
        varlen: If False, every row is truncated (with a warning) or padded
            with "" so that it has exactly one value per field name. If True,
            rows may have variable length: the last field name maps to a
            tuple of all values from that column onward (possibly empty), and
            rows too short to fill the preceding fields are skipped.

    Returns:
        A list with one dict per kept data row, mapping field name to value.

    Raises:
        Exception: If the header contains a duplicate field name, or if any
            of ``required_fieldnames`` is missing from the header.
    """

    # newline='' lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(filename, newline='') as file:
        reader = csv.reader(file)

        # header row: clean the names and trim trailing blanks
        # (an empty file yields no header, hence the [] default)
        fieldnames = [ids.clean_id(fieldname) for fieldname in next(reader, [])]
        while fieldnames and fieldnames[-1] == '':
            fieldnames.pop()
        if len(set(fieldnames)) != len(fieldnames):
            raise Exception("Duplicate field name:" + str(fieldnames))

        # data rows
        row_dicts = []
        for row in reader:
            row = ["" if item is None else ids.clean_id(item) for item in row]
            while row and row[-1] == '':
                row.pop()

            if varlen:
                # All fields but the last need a value of their own.
                if len(row) < len(fieldnames) - 1:
                    # Completely blank rows are skipped silently; only a
                    # non-empty short row is worth a warning.
                    if row:
                        warnings.warn("Ignoring too-short row:"+str(row))
                    continue
                row_dict = dict(zip(fieldnames, row))
                # The last field collects every remaining value as a tuple.
                row_dict[fieldnames[-1]] = tuple(row[len(fieldnames)-1:])
            else:
                if len(row) > len(fieldnames):
                    warnings.warn("Ignoring extra values in row:"+str(row))
                    row = row[:len(fieldnames)]
                # pad short rows so every field name gets a value
                row.extend([""] * (len(fieldnames) - len(row)))
                row_dict = dict(zip(fieldnames, row))

            row_dicts.append(row_dict)

    if required_fieldnames is not None:
        # check that all required fieldnames are present
        required_fieldnames = [ids.clean_id(name) for name in required_fieldnames]
        missing_fieldnames = set(required_fieldnames).difference(set(fieldnames))
        if missing_fieldnames:
            raise Exception("File {} has fieldnames {}, while {} are required. Missing {}."
                            .format(filename, fieldnames,
                                    required_fieldnames, missing_fieldnames))
        # check to see if extra fieldnames present; warn user if so
        extra_fieldnames = set(fieldnames).difference(set(required_fieldnames))
        if extra_fieldnames:
            warnings.warn("File {} has extra fieldnames (ignored): {}"
                            .format(filename, extra_fieldnames))
    return row_dicts
예제 #2
0
def test_clean_id():
    """Check ids.clean_id against inputs with leading, trailing and inner whitespace."""

    # (raw input, expected cleaned identifier)
    cases = (
        (" ab", "ab"),
        ("ab ", "ab"),
        ("  ab cd  ", "ab cd"),
        ("\t ab\n cd\n", "ab cd"),
    )
    for raw, expected in cases:
        assert ids.clean_id(raw) == expected