def makeDictFromDB(idFields, fname):
    '''
    Build a count dictionary from the rows stored in a .db file.

    Opens the database file named by fname through dbOpen (the file is
    assumed to be sqlite), reads every row of the 'source' table ordered by
    user_id, and passes those rows to makeDict(idFields, rows). Whatever
    makeDict returns is returned unchanged.

    :param idFields: List of indexes of the fields to be concatenated to form
        the dictionary key; handed straight through to makeDict
    :param fname: Name of the database file to open
    :return: the dictionary built by makeDict -- per the original notes,
        keyed by the concatenation of the values of the indexed fields, with
        values the number of rows that share those values
    '''
    cursor = dbOpen(fname)
    cursor.execute('SELECT * FROM source ORDER BY user_id')
    return makeDict(idFields, cursor.fetchall())
#!/usr/bin/env python
"""
Given some set of databases, count the number of distinct values in a
particular field. Right now, it is assumed that the table is named "source".

Every command-line argument is treated as the name of a database file; for
each one the script prints the file name, the number of distinct user_id
values, and the number of distinct (user_id, course_id) combinations.
"""

from de_id_functions import dbOpen
import sys


def count_fields(c, fname, tbl_name):
    """
    Count the distinct values of a field (or field combination) in a table.

    The query groups the table by the requested field(s), so it yields one
    row per distinct value; the number of rows fetched is the count.

    :param c: cursor-like object providing execute() and fetchall()
    :param fname: column name, or a comma-separated list of column names, to
        select and group by
    :param tbl_name: name of the table to query
    :return: the number of rows returned by the grouped query
    """
    # Column and table names cannot be bound as SQL parameters, so the
    # statement is assembled by concatenation; only pass trusted identifiers.
    db_command = "Select " + fname + " from " + tbl_name + " group by " + fname
    c.execute(db_command)
    return len(c.fetchall())


if __name__ == '__main__':
    # Single-argument print(...) gives the same output whether print is a
    # statement (Python 2) or a function (Python 3). The ' '.join reproduces
    # the separating space the print statement put between its two items.
    for db_name in sys.argv[1:]:
        print(db_name)
        c = dbOpen(db_name)
        user_total = count_fields(c, 'user_id', 'source')
        print(' '.join(['Number of unique user ids = ', str(user_total)]))
        pair_total = count_fields(c, 'user_id, course_id', 'source')
        print(' '.join(['Number of unique user, class combinations = ',
                        str(pair_total)]))
        print('')
#!/usr/bin/env python
'''
Drive buildcountrygeneralizer.py to produce a group of generalization files
for countries.

Each output file is named 'countryGen' followed by the label that is the
first member of a pair in bin_info; the second member of that pair is the bin
size handed to the generalizer. The pairs currently listed are 01k (1000),
02k (2000), 03k (3000) and 04k (4000).

The country codes are read from the cc_by_ip column of the source table in
year.db. The country-to-region mapping is expected at
'../../country_continent'; per the original notes it is a pickled dictionary
mapping countries to larger regions.
'''

import buildcountrygeneralizer as bcg
from de_id_functions import dbOpen

bin_info = [
    ('01k', 1000),
    ('02k', 2000),
    ('03k', 3000),
    ('04k', 4000)
]

cr = dbOpen('year.db')
cr.execute('Select cc_by_ip from source')
cc_list = cr.fetchall()

# The mapping file is the same for every bin size.
cc_to_regFile = '../../country_continent'

# One generalizer run per (label, bin size) pair.
for label, bin_size in bin_info:
    outfile = 'countryGen' + label
    bcg.main(cc_list, outfile, cc_to_regFile, bin_size)
#!/usr/bin/env python
'''
Run the program that builds the full set of records to suppress.

Per the original notes, this is run once the binning and the suppression
based on the classes for which a user enrolled are in place. For every
combination of k value and bin suffix, buildFullSuppressionSet.main is called
with the cursor on year.db, the input names derived from the k value and the
suffix (class suppression set, geographic generalization, yob bins, forum
post bins), the output name, and the k value itself.
'''

import buildFullSuppressionSet
from de_id_functions import dbOpen

# Bin suffixes previously listed here: ['05', '10', '15', '20', '25']
bin_size = ['00']
k_values = [3, 4, 5, 6]
cr = dbOpen('year.db')
geo_base = 'countryGen'

for k_val in k_values:
    # Names that depend only on k are assembled once per k value.
    k_tag = str(k_val) + 'P'
    class_supp = 'classSuppressSet' + k_tag
    for s in bin_size:
        bin_tag = s + 'k'
        buildFullSuppressionSet.main(
            cr,
            class_supp,
            geo_base + bin_tag,
            'yobbin' + bin_tag,
            'postbin' + bin_tag,
            # The suffix appears twice in the output name, as in the
            # original naming scheme.
            'fullSuppress' + s + s + k_tag,
            k_val
        )
dropClass(classlist, cdict[classlist], cdict, c, suppressionset, use_suppress) print count print len(suppressionset) sfile = open(outname, 'w') pickle.dump(suppressionset, sfile) sfile.close() if __name__ == '__main__': if len(sys.argv) < 3: print 'Usage: courseSetDeidentify.py dbname k-value {P,R}' print 'where P is suppression on level of participation and R is random' dbName = sys.argv[1] outname = 'classSuppressSet' k_val = int(sys.argv[2]) if sys.argv[3] == 'R': suppress_method = 'R' else: suppress_method = "P" outname = outname + str(k_val) + suppress_method c = dbOpen(dbName) try: c.execute("Create Index user_id_idx on source ('user_id')") except: pass c.execute('SELECT user_id, course_id FROM source ORDER BY user_id') user_class_list = c.fetchall() main(user_class_list, c, k_val, suppress_method, outname) dbClose(c)
use_suppress) print count print len(suppressionset) sfile = open(outname, 'w') pickle.dump(suppressionset, sfile) sfile.close() if __name__ == '__main__': if len(sys.argv) < 3: print 'Usage: courseSetDeidentify.py dbname k-value {P,R}' print 'where P is suppression on level of participation and R is random' dbName = sys.argv[1] outname = 'classSuppressSet' k_val = int(sys.argv[2]) if sys.argv[3] == 'R': suppress_method = 'R' else: suppress_method = "P" outname = outname + str(k_val) + suppress_method c = dbOpen(dbName) try: c.execute("Create Index user_id_idx on source ('user_id')") except: pass c.execute('SELECT user_id, course_id FROM source ORDER BY user_id') user_class_list = c.fetchall() main(user_class_list, k_val, suppress_method, outname) dbClose(c)
#!/usr/bin/env python
'''
Report how many records and how many distinct users a database holds.

The first command-line argument names the database file. The course_id and
user_id columns of every row in the source table are read, then the total
row count and the number of distinct user_id values are printed.
'''
from de_id_functions import dbOpen
import sys

if __name__ == '__main__':
    cursor = dbOpen(sys.argv[1])
    cursor.execute('Select course_id, user_id from source')
    records = cursor.fetchall()

    # Column 1 of each row is user_id (second column of the select).
    distinct_users = set(row[1] for row in records)

    # Single-argument print(...) emits the same text as the two-item print
    # statement: the join supplies the space that separated the items.
    print(' '.join(['Total number of records = ', str(len(records))]))
    print(' '.join(['Total number of users = ', str(len(distinct_users))]))