Пример #1
0
def check_types(filename, config_file):
    """Check every cell of a CSV file against the per-column checks.

    The file is loaded with ``read_csv`` and the checks with
    ``regex_from_config_file``; each check is looked up by column header and
    applied through its ``match`` method (presumably compiled regular
    expressions -- confirm against ``regex_from_config_file``).

    A column without a check is reported once, using the first value seen
    for it.  Every failing cell is reported together with its row, and the
    total is printed at the end.

    Raises:
        AssertionError: if at least one cell failed its check.
    """
    headers, rows = read_csv(filename=filename)
    regex_checks = regex_from_config_file(config_file)

    failures = 0
    reported_missing = set()
    for row in rows:
        for position, value in enumerate(row):
            name = headers[position]

            if name in regex_checks:
                if regex_checks[name].match(value):
                    continue
                print(" ---> Mismatch on {}: {} {}".format(name, value, row))
                failures += 1
            elif name not in reported_missing:
                # Only announce a missing check the first time it is seen.
                reported_missing.add(name)
                print(" ---> No type check for {}: {}".format(name, value))

    print(" ---> Mismatched count: {}".format(failures))
    assert failures == 0
Пример #2
0
def importFromCSV(csv_filename, database_filename):
    """Used to convert from old .csv format to database.

    Reads ``csv_filename`` with ``file_util.read_csv``, skips the first row
    and turns every remaining row that has the expected number of columns
    and a recent enough timestamp into ``(name, time, count)`` tuples, one
    per data column.  The tuples are inserted through
    ``Database(database_filename, 'server_status')`` and the resulting
    column view is printed.

    Rows with exactly one cell are ignored silently.  Any other row that is
    not imported (wrong width, or timestamp not newer than the cutoff) is
    printed together with its length.

    Raises:
        ValueError: if the first cell of a full-width row is not an integer.
    """
    import file_util

    data = list(file_util.read_csv(csv_filename))

    # Fixed column labels; the header row found in the file is not used.
    header = [
        'date',
        'Weather At Home European region   (hadam3p_eu) Tasks ready to send',
        'Weather At Home Pacific North West region (hadam3p_pnw) Tasks ready to send',
        'Weather At Home Australia New Zealand region (hadam3p_anz) Tasks ready to send',
        'hadcm3n Tasks ready to send',
        #'RAPIT project (hadcm3n) Tasks ready to send',
        'hadam3p (Global model only) with MOSES II land scheme Tasks ready to send',
        'Total  Tasks ready to send',
        'Tasks in progress'
    ]

    # NOTE(review): 1406206802 looks like a Unix timestamp in seconds, which
    # would make the cutoff "one week before that moment" -- confirm.
    cutoff = 1406206802 - 7 * 24 * 60 * 60

    new_entries = []
    for row in data[1:]:
        if len(row) == len(header) and int(row[0]) > cutoff:
            new_entries.extend(
                (header[ix].strip(), row[0], row[ix])  # name, time, count
                for ix in range(1, len(row)))
        elif len(row) != 1:
            # Report rows that were neither imported nor single-cell rows.
            # Single-argument print() gives the same output on Python 2 and 3.
            print("%s %s" % (len(row), row))

    db = Database(database_filename, 'server_status')
    db.insert(new_entries)

    columns, records = db.select_column_view()
    print(columns)
    for record in records:
        print(record)
Пример #3
0
    for line in output_csv:
        completed = False
        for i, item in enumerate(line):
            if headers[i] == 'completed' and item == "True":
                total_completed += 1
                completed = True
        for i, item in enumerate(line):
            if headers[i] == 'course_id':
                course_student_counts[item] += 1
                course_completion_rates[item]['completed' if completed else 'attempted'] += 1

    completion_rate = total_completed/len(output_csv)
    print(" ---> Completion rate overall: %-7s %s%%" % (len(output_csv), round(completion_rate*100, 2)))
    for course_id, rates in course_completion_rates.items():
        student_count = course_student_counts[course_id]
        completion_rate = rates['completed']/(rates['completed']+rates['attempted'])
        # print(" ---> Completion rate for %-30s: %-7s %s%%" % (course_id, student_count, round(completion_rate*100, 2)))
    
    return output_csv
    
if __name__ == '__main__':
    # sys.argv[0] is the script name, so the four arguments named in the
    # usage line (infile, configfile, outfile, k) need at least five entries.
    # With a smaller threshold a call without k would reach sys.argv[4] and
    # fail with an IndexError instead of showing the usage text.
    if len(sys.argv) < 5:
        print('Usage: python k_suppress.py infile.csv configfile outfile.csv k')
        sys.exit(1)

    headers, rows = read_csv(filename=sys.argv[1])
    delete_columns, qi_columns = columns_from_config_file(sys.argv[2])
    out_filename = sys.argv[3]
    k = int(sys.argv[4])  # passed to k_suppress as an integer

    k_suppress(headers, rows, delete_columns, qi_columns, out_filename, k)
Пример #4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 2019-02-18
@author Susan
Get the number of unique values in each column of a dataset.
"""
import sys
from file_util import read_csv, columns_from_config_file
from deidentifier_util import count_column_uniques

if __name__ == '__main__':
    # Both the data file and the config file must be given.
    if len(sys.argv) < 3:
        print('Usage: python count_column_uniques.py file config_file')
        sys.exit(1)

    config = sys.argv[2]
    headers, rows = read_csv(sys.argv[1])

    # The two results are not used below; the call is kept as it was so the
    # config file is still read.
    deleted, qi_columns = columns_from_config_file(config)

    unique_values = count_column_uniques(rows, headers)

    for col in headers:
        # Columns with more than 10 entries are summarised by their count;
        # shorter ones are printed in full.
        if len(unique_values[col]) <= 10:
            print(col, unique_values[col])
        else:
            print(col, len(unique_values[col]))