def prepare_data(db):
    """Run the one-time setup steps of the pop_synthesis process.

    In order, this:
      1. counts the rows of ``housing_pums`` (size of the Master Matrix),
      2. looks up the control variables and their dimensions for households
         ('hhld'), group quarters ('gq') and persons ('person'),
      3. adds the unique ids for each of those three entity types,
      4. populates the Master (frequency) Matrix, its pseudo-sparse
         representation and the index matrix,
      5. creates the synthetic attribute tables, and
      6. creates the joint distributions for the total PUMS sample
         (called with 0, 0, 0 as the trailing arguments).

    Args:
        db: open database connection; only ``db.cursor()`` is used directly,
            the connection itself is passed on to the helper modules.

    Returns:
        None. The matrices returned by the helper calls are not handed back
        to the caller.
    """
    entity_types = ('hhld', 'gq', 'person')

    # Identifying the number of housing units to build the Master Matrix.
    # Ask the server for the count instead of selecting every row and
    # reading cursor.rowcount, and always release the cursor.
    dbc = db.cursor()
    try:
        dbc.execute('select count(*) from housing_pums')
        housing_units = int(dbc.fetchone()[0])
    finally:
        dbc.close()

    ti = time.clock()

    # Identifying the control variables for the households, gq's, and persons
    control_variables = {}
    for entity in entity_types:
        control_variables[entity] = (
            adjusting_pums_joint_distribution.choose_control_variables(
                db, entity))

    # Identifying the number of categories within each control variable
    dimensions = {}
    for entity in entity_types:
        dimensions[entity] = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(
                db, entity, control_variables[entity]))

    print('Dimensions and Control Variables created in %.4f'
          % (time.clock() - ti))
    ti = time.clock()

    # Unique ids are added in the fixed order hhld, gq, person.
    for entity in entity_types:
        update_string = adjusting_pums_joint_distribution.create_update_string(
            db, control_variables[entity], dimensions[entity])
        adjusting_pums_joint_distribution.add_unique_id(
            db, entity, update_string)

    print('Uniqueid\'s created in %.4f' % (time.clock() - ti))
    ti = time.clock()

    # Populating the Master Matrix
    populated_matrix = psuedo_sparse_matrix.populate_master_matrix(
        db, 0, housing_units,
        dimensions['hhld'], dimensions['gq'], dimensions['person'])
    print('Frequency Matrix Populated in %.4f' % (time.clock() - ti))
    ti = time.clock()

    # Sparse representation of the Master Matrix (return value is not used
    # here; the call is kept as in the original flow).
    psuedo_sparse_matrix.psuedo_sparse_matrix(db, populated_matrix, 0)
    print('Psuedo Sparse Representation of the Frequency Matrix created in %.4f'
          % (time.clock() - ti))
    ti = time.clock()

    # Creating Index Matrix (return value is likewise not used here).
    psuedo_sparse_matrix.generate_index_matrix(db, 0)
    print('Index matrix created in %.4f' % (time.clock() - ti))

    # creating synthetic_population tables in MySQL
    drawing_households.create_synthetic_attribute_tables(db)

    # Total PUMS Sample x composite_type adjustment for hhld, gq and person
    for entity in entity_types:
        adjusting_pums_joint_distribution.create_joint_dist(
            db, entity, control_variables[entity], dimensions[entity],
            0, 0, 0)
def configure_and_run(index_matrix, p_index_matrix, geoid):
    """Synthesize households and persons for one geography and store them.

    Steps, as printed while running:
      1A/1B/1C. IPF for households, group quarters and persons.
      2. IPU heuristic to obtain weights satisfying the household and person
         constraints.
      3. Repeated random draws of housing units (at most 25); the draw kept is
         the latest one that improved either the p-value or the chi-square
         statistic of the person distribution against the IPF objective.
    The kept draw is written through
    ``drawing_households.storing_synthetic_attributes`` and committed.

    Args:
        index_matrix: index matrix passed to the IPU heuristic and to
            ``drawing_households.drawing_housing_units``.
        p_index_matrix: index matrix passed to
            ``drawing_households.synthetic_population_properties``.
        geoid: sequence whose first three items are the PUMA, tract and block
            group identifiers; each is converted with ``int``.

    Returns:
        None.

    Raises:
        Any error raised by the database driver or the helper modules is
        propagated; the cursor and connection are closed in every case and
        nothing is committed on failure.
    """
    pumano = int(geoid[0])
    tract = int(geoid[1])
    bg = int(geoid[2])

    max_draws = 25
    target_p_value = 0.9999

    print('------------------------------------------------------------------')
    print('Geography: PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # NOTE(review): connection settings (including the password) are
    # hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host='localhost', user='******', passwd='1234',
                         db='ncpopsyn')
    try:
        dbc = db.cursor()
        try:
            ti = time.clock()

            # Identifying the number of housing units in the disaggregate
            # sample. Make sure that the file is sorted by hhid.
            dbc.execute('select * from housing_pums')
            housing_pums = numpy.asarray(dbc.fetchall())[:, 1:]
            housing_units = dbc.rowcount

            # Control variables for the households, gq's, and persons
            hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'hhld')
            gq_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'gq')
            person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'person')

            # Number of categories within each control variable
            hhld_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'hhld', hhld_control_variables))
            gq_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'gq', gq_control_variables))
            person_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'person', person_control_variables))

            # Creating the sparse array
            dbc.execute('select * from sparse_matrix1_%s' % (0))
            sp_matrix = numpy.asarray(dbc.fetchall())

            # Running IPF for Households
            print('Step 1A: Running IPF procedure for Households... \n')
            hhld_objective_frequency, hhld_estimated_constraint = ipf.ipf_config_run(
                db, 'hhld', hhld_control_variables, hhld_dimensions,
                pumano, tract, bg)
            print('IPF procedure for Households completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Running IPF for GQ
            print('Step 1B: Running IPF procedure for Gqs... ')
            gq_objective_frequency, gq_estimated_constraint = ipf.ipf_config_run(
                db, 'gq', gq_control_variables, gq_dimensions,
                pumano, tract, bg)
            print('IPF procedure for GQ was completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Running IPF for Persons
            print('Step 1C: Running IPF procedure for Persons... ')
            person_objective_frequency, person_estimated_constraint = ipf.ipf_config_run(
                db, 'person', person_control_variables, person_dimensions,
                pumano, tract, bg)
            print('IPF procedure for Persons completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Creating the weights array: -99 everywhere, 1 for every row
            # number that occurs in the sparse matrix table.
            print('Step 2: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... \n')
            dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (0))
            result = numpy.asarray(dbc.fetchall())[:, 0]
            weights = numpy.ones(housing_units, dtype=float) * -99
            weights[result] = 1

            # Creating the control array
            total_constraint = numpy.hstack((hhld_estimated_constraint[:, 0],
                                             gq_estimated_constraint[:, 0],
                                             person_estimated_constraint[:, 0]))

            # Running the heuristic algorithm for the required geography
            weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(
                db, 0, index_matrix, weights, total_constraint, sp_matrix)
            print('IPU procedure was completed in %.2f sec\n'
                  % (time.clock() - ti))
            ti = time.clock()

            print('Step 3: Creating the synthetic households and individuals...')
            # creating whole marginal values
            hhld_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(hhld_control_variables)
            hhld_frequencies = drawing_households.create_whole_frequencies(
                db, 'hhld', hhld_order_dummy, pumano, tract, bg)
            gq_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(gq_control_variables)
            gq_frequencies = drawing_households.create_whole_frequencies(
                db, 'gq', gq_order_dummy, pumano, tract, bg)
            frequencies = numpy.hstack((hhld_frequencies[:, 0],
                                        gq_frequencies[:, 0]))

            # Sampling households and choosing the draw with the best match
            # with the objective distribution.
            dbc.execute('select * from person_pums')
            person_pums = numpy.asarray(dbc.fetchall())[:, 1:]

            # Start as None so the first draw is always kept; otherwise the
            # storing step below could hit unbound names when no draw ever
            # satisfies the acceptance test.
            max_p_housing_attributes = None
            max_p_person_attributes = None
            p_value = 0
            max_p = 0
            min_chi = 1e10
            draw_count = 0
            while p_value < target_p_value and draw_count < max_draws:
                draw_count = draw_count + 1
                synthetic_housing_units = drawing_households.drawing_housing_units(
                    db, frequencies, weights, index_matrix, sp_matrix, 0)

                # Creating synthetic hhld, and person attribute tables
                synthetic_housing_attributes, synthetic_person_attributes = drawing_households.synthetic_population_properties(
                    db, synthetic_housing_units, p_index_matrix,
                    housing_pums, person_pums)

                stat, count_person, person_estimated_frequency = drawing_households.checking_against_joint_distribution(
                    person_objective_frequency, synthetic_person_attributes,
                    person_dimensions, pumano, tract, bg)
                dof = count_person - 1

                # chi2.sf is the survival function of the chi-square
                # distribution, i.e. what the removed
                # scipy.stats.stats.chisqprob(stat, dof) used to return.
                p_value = scipy.stats.chi2.sf(stat, dof)

                if (max_p_housing_attributes is None
                        or p_value > max_p or stat < min_chi):
                    max_p = p_value
                    max_p_housing_attributes = synthetic_housing_attributes
                    max_p_person_attributes = synthetic_person_attributes
                    min_chi = stat

            # Decide on the p-value rather than the draw counter so that a
            # target reached on the very last allowed draw is reported as
            # a success.
            if p_value < target_p_value:
                print('Max Iterations reached for drawing households with the best draw having a p-value of %.4f' % (max_p))
            else:
                print('Population with desirable p-value of %.4f was obtained in %d iterations' % (max_p, draw_count))

            drawing_households.storing_synthetic_attributes(
                db, 'housing', max_p_housing_attributes, pumano, tract, bg)
            drawing_households.storing_synthetic_attributes(
                db, 'person', max_p_person_attributes, pumano, tract, bg)

            # pumano, tract and bg were converted with int() above, so only
            # integers are interpolated into these statements.
            dbc.execute('select hhtotal from housing_marginals where pumano = %s and tract = %s and bg = %s' % (pumano, tract, bg))
            housingtotal = dbc.fetchall()[0][0]

            dbc.execute('select sum(gender1 + gender2) from person_marginals where pumano = %s and tract = %s and bg = %s' % (pumano, tract, bg))
            persontotal = dbc.fetchall()[0][0]

            print('Number of Synthetic Household - %d, and given Household total from the Census SF - %d'
                  % (sum(max_p_housing_attributes[:, -2]), housingtotal))
            print('Number of Synthetic Persons - %d and given Person total from the Census SF - %d'
                  % (sum(max_p_person_attributes[:, -1]), persontotal))
            print('Synthetic households created for the geography in %.2f\n'
                  % (time.clock() - ti))

            db.commit()
        finally:
            dbc.close()
    finally:
        db.close()
def configure_and_run(index_matrix, p_index_matrix, geoid):
    """Synthesize households and persons for one geography and store them.

    Steps, as printed while running:
      1A/1B/1C. IPF for households, group quarters and persons.
      2. IPU heuristic to obtain weights satisfying the household and person
         constraints.
      3. Repeated random draws of housing units (at most 25); the draw kept is
         the latest one that improved either the p-value or the chi-square
         statistic of the person distribution against the IPF objective.
    The kept draw is written through
    ``drawing_households.storing_synthetic_attributes`` and committed.

    Args:
        index_matrix: index matrix passed to the IPU heuristic and to
            ``drawing_households.drawing_housing_units``.
        p_index_matrix: index matrix passed to
            ``drawing_households.synthetic_population_properties``.
        geoid: sequence whose first three items are the PUMA, tract and block
            group identifiers; each is converted with ``int``.

    Returns:
        None.

    Raises:
        Any error raised by the database driver or the helper modules is
        propagated; the cursor and connection are closed in every case and
        nothing is committed on failure.
    """
    pumano = int(geoid[0])
    tract = int(geoid[1])
    bg = int(geoid[2])

    max_draws = 25
    target_p_value = 0.9999

    print('------------------------------------------------------------------')
    print('Geography: PUMA ID- %s, Tract ID- %0.2f, BG ID- %s'
          % (pumano, float(tract) / 100, bg))
    print('------------------------------------------------------------------')

    # NOTE(review): connection settings (including the password) are
    # hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host='localhost', user='******', passwd='1234',
                         db='ncpopsyn')
    try:
        dbc = db.cursor()
        try:
            ti = time.clock()

            # Identifying the number of housing units in the disaggregate
            # sample. Make sure that the file is sorted by hhid.
            dbc.execute('select * from housing_pums')
            housing_pums = numpy.asarray(dbc.fetchall())[:, 1:]
            housing_units = dbc.rowcount

            # Control variables for the households, gq's, and persons
            hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'hhld')
            gq_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'gq')
            person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'person')

            # Number of categories within each control variable
            hhld_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'hhld', hhld_control_variables))
            gq_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'gq', gq_control_variables))
            person_dimensions = numpy.asarray(
                adjusting_pums_joint_distribution.create_dimensions(
                    db, 'person', person_control_variables))

            # Creating the sparse array
            dbc.execute('select * from sparse_matrix1_%s' % (0))
            sp_matrix = numpy.asarray(dbc.fetchall())

            # Running IPF for Households
            print('Step 1A: Running IPF procedure for Households... \n')
            hhld_objective_frequency, hhld_estimated_constraint = ipf.ipf_config_run(
                db, 'hhld', hhld_control_variables, hhld_dimensions,
                pumano, tract, bg)
            print('IPF procedure for Households completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Running IPF for GQ
            print('Step 1B: Running IPF procedure for Gqs... ')
            gq_objective_frequency, gq_estimated_constraint = ipf.ipf_config_run(
                db, 'gq', gq_control_variables, gq_dimensions,
                pumano, tract, bg)
            print('IPF procedure for GQ was completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Running IPF for Persons
            print('Step 1C: Running IPF procedure for Persons... ')
            person_objective_frequency, person_estimated_constraint = ipf.ipf_config_run(
                db, 'person', person_control_variables, person_dimensions,
                pumano, tract, bg)
            print('IPF procedure for Persons completed in %.2f sec \n'
                  % (time.clock() - ti))
            ti = time.clock()

            # Creating the weights array: -99 everywhere, 1 for every row
            # number that occurs in the sparse matrix table.
            print('Step 2: Running IPU procedure for obtaining weights that satisfy Household and Person type constraints... \n')
            dbc.execute('select rowno from sparse_matrix1_%s group by rowno' % (0))
            result = numpy.asarray(dbc.fetchall())[:, 0]
            weights = numpy.ones(housing_units, dtype=float) * -99
            weights[result] = 1

            # Creating the control array
            total_constraint = numpy.hstack((hhld_estimated_constraint[:, 0],
                                             gq_estimated_constraint[:, 0],
                                             person_estimated_constraint[:, 0]))

            # Running the heuristic algorithm for the required geography
            weights, conv_crit_array, wts_array = heuristic_algorithm.heuristic_adjustment(
                db, 0, index_matrix, weights, total_constraint, sp_matrix)
            print('IPU procedure was completed in %.2f sec\n'
                  % (time.clock() - ti))
            ti = time.clock()

            print('Step 3: Creating the synthetic households and individuals...')
            # creating whole marginal values
            hhld_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(hhld_control_variables)
            hhld_frequencies = drawing_households.create_whole_frequencies(
                db, 'hhld', hhld_order_dummy, pumano, tract, bg)
            gq_order_dummy = adjusting_pums_joint_distribution.create_aggregation_string(gq_control_variables)
            gq_frequencies = drawing_households.create_whole_frequencies(
                db, 'gq', gq_order_dummy, pumano, tract, bg)
            frequencies = numpy.hstack((hhld_frequencies[:, 0],
                                        gq_frequencies[:, 0]))

            # Sampling households and choosing the draw with the best match
            # with the objective distribution.
            dbc.execute('select * from person_pums')
            person_pums = numpy.asarray(dbc.fetchall())[:, 1:]

            # Start as None so the first draw is always kept; otherwise the
            # storing step below could hit unbound names when no draw ever
            # satisfies the acceptance test.
            max_p_housing_attributes = None
            max_p_person_attributes = None
            p_value = 0
            max_p = 0
            min_chi = 1e10
            draw_count = 0
            while p_value < target_p_value and draw_count < max_draws:
                draw_count = draw_count + 1
                synthetic_housing_units = drawing_households.drawing_housing_units(
                    db, frequencies, weights, index_matrix, sp_matrix, 0)

                # Creating synthetic hhld, and person attribute tables
                synthetic_housing_attributes, synthetic_person_attributes = drawing_households.synthetic_population_properties(
                    db, synthetic_housing_units, p_index_matrix,
                    housing_pums, person_pums)

                stat, count_person, person_estimated_frequency = drawing_households.checking_against_joint_distribution(
                    person_objective_frequency, synthetic_person_attributes,
                    person_dimensions, pumano, tract, bg)
                dof = count_person - 1

                # chi2.sf is the survival function of the chi-square
                # distribution, i.e. what the removed
                # scipy.stats.stats.chisqprob(stat, dof) used to return.
                p_value = scipy.stats.chi2.sf(stat, dof)

                if (max_p_housing_attributes is None
                        or p_value > max_p or stat < min_chi):
                    max_p = p_value
                    max_p_housing_attributes = synthetic_housing_attributes
                    max_p_person_attributes = synthetic_person_attributes
                    min_chi = stat

            # Decide on the p-value rather than the draw counter so that a
            # target reached on the very last allowed draw is reported as
            # a success.
            if p_value < target_p_value:
                print('Max Iterations reached for drawing households with the best draw having a p-value of %.4f' % (max_p))
            else:
                print('Population with desirable p-value of %.4f was obtained in %d iterations' % (max_p, draw_count))

            drawing_households.storing_synthetic_attributes(
                db, 'housing', max_p_housing_attributes, pumano, tract, bg)
            drawing_households.storing_synthetic_attributes(
                db, 'person', max_p_person_attributes, pumano, tract, bg)

            # pumano, tract and bg were converted with int() above, so only
            # integers are interpolated into these statements.
            dbc.execute('select hhtotal from housing_marginals where pumano = %s and tract = %s and bg = %s' % (pumano, tract, bg))
            housingtotal = dbc.fetchall()[0][0]

            dbc.execute('select sum(gender1 + gender2) from person_marginals where pumano = %s and tract = %s and bg = %s' % (pumano, tract, bg))
            persontotal = dbc.fetchall()[0][0]

            print('Number of Synthetic Household - %d, and given Household total from the Census SF - %d'
                  % (sum(max_p_housing_attributes[:, -2]), housingtotal))
            print('Number of Synthetic Persons - %d and given Person total from the Census SF - %d'
                  % (sum(max_p_person_attributes[:, -1]), persontotal))
            print('Synthetic households created for the geography in %.2f\n'
                  % (time.clock() - ti))

            db.commit()
        finally:
            dbc.close()
    finally:
        db.close()
db.commit() return index_matrix if __name__ == '__main__': sample_size = 156601 pumano = 0 db = MySQLdb.connect(user = '******', passwd = '1234', db = 'ncpopsyn') hhld_dimensions = arr([5,7,8]) person_dimensions = arr([2, 10, 7]) hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'hhld') person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, 'person') update_string = adjusting_pums_joint_distribution.create_update_string(db, hhld_control_variables, hhld_dimensions) adjusting_pums_joint_distribution.add_unique_id(db, 'hhld', update_string) update_string = adjusting_pums_joint_distribution.create_update_string(db, person_control_variables, person_dimensions) adjusting_pums_joint_distribution.add_unique_id(db, 'person', update_string) ti = time.clock() print 'start - %s'%ti populated_matrix = populate_master_matrix(db, pumano, sample_size, hhld_dimensions, person_dimensions) print 'End Populated matrix - %s'%(time.clock()-ti) ti = time.clock()
dbc.close() db.commit() return index_matrix if __name__ == '__main__': sample_size = 156601 pumano = 0 db = MySQLdb.connect(user='******', passwd='1234', db='ncpopsyn') hhld_dimensions = arr([5, 7, 8]) person_dimensions = arr([2, 10, 7]) hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables( db, 'hhld') person_control_variables = adjusting_pums_joint_distribution.choose_control_variables( db, 'person') update_string = adjusting_pums_joint_distribution.create_update_string( db, hhld_control_variables, hhld_dimensions) adjusting_pums_joint_distribution.add_unique_id(db, 'hhld', update_string) update_string = adjusting_pums_joint_distribution.create_update_string( db, person_control_variables, person_dimensions) adjusting_pums_joint_distribution.add_unique_id(db, 'person', update_string) ti = time.clock() print 'start - %s' % ti populated_matrix = populate_master_matrix(db, pumano, sample_size,
index_matrix = arr(result) dbc.close() return index_matrix if __name__ == "__main__": sample_size = 156601 pumano = 0 db = MySQLdb.connect(user="******", passwd="1234", db="ncpopsyn") hhld_dimensions = arr([5, 7, 8]) person_dimensions = arr([2, 10, 7]) hhld_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, "hhld") person_control_variables = adjusting_pums_joint_distribution.choose_control_variables(db, "person") update_string = adjusting_pums_joint_distribution.create_update_string(db, hhld_control_variables, hhld_dimensions) adjusting_pums_joint_distribution.add_unique_id(db, "hhld", update_string) update_string = adjusting_pums_joint_distribution.create_update_string( db, person_control_variables, person_dimensions ) adjusting_pums_joint_distribution.add_unique_id(db, "person", update_string) ti = time.clock() print "start - %s" % ti populated_matrix = populate_master_matrix(db, pumano, sample_size, hhld_dimensions, person_dimensions) print "End Populated matrix - %s" % (time.clock() - ti)
def prepare_data(db):
    """Run the one-time setup steps of the pop_synthesis process.

    In order, this:
      1. counts the rows of ``housing_pums`` (size of the Master Matrix),
      2. looks up the control variables and their dimensions for households
         ('hhld'), group quarters ('gq') and persons ('person'),
      3. adds the unique ids for each of those three entity types,
      4. populates the Master (frequency) Matrix, its pseudo-sparse
         representation and the index matrix,
      5. creates the synthetic attribute tables, and
      6. creates the joint distributions for the total PUMS sample
         (called with 0, 0, 0 as the trailing arguments).

    Args:
        db: open database connection; only ``db.cursor()`` is used directly,
            the connection itself is passed on to the helper modules.

    Returns:
        None. The matrices returned by the helper calls are not handed back
        to the caller.
    """
    entity_types = ('hhld', 'gq', 'person')

    # Identifying the number of housing units to build the Master Matrix.
    # Ask the server for the count instead of selecting every row and
    # reading cursor.rowcount, and always release the cursor.
    dbc = db.cursor()
    try:
        dbc.execute('select count(*) from housing_pums')
        housing_units = int(dbc.fetchone()[0])
    finally:
        dbc.close()

    ti = time.clock()

    # Identifying the control variables for the households, gq's, and persons
    control_variables = {}
    for entity in entity_types:
        control_variables[entity] = (
            adjusting_pums_joint_distribution.choose_control_variables(
                db, entity))

    # Identifying the number of categories within each control variable
    dimensions = {}
    for entity in entity_types:
        dimensions[entity] = numpy.asarray(
            adjusting_pums_joint_distribution.create_dimensions(
                db, entity, control_variables[entity]))

    print('Dimensions and Control Variables created in %.4f'
          % (time.clock() - ti))
    ti = time.clock()

    # Unique ids are added in the fixed order hhld, gq, person.
    for entity in entity_types:
        update_string = adjusting_pums_joint_distribution.create_update_string(
            db, control_variables[entity], dimensions[entity])
        adjusting_pums_joint_distribution.add_unique_id(
            db, entity, update_string)

    print('Uniqueid\'s created in %.4f' % (time.clock() - ti))
    ti = time.clock()

    # Populating the Master Matrix
    populated_matrix = psuedo_sparse_matrix.populate_master_matrix(
        db, 0, housing_units,
        dimensions['hhld'], dimensions['gq'], dimensions['person'])
    print('Frequency Matrix Populated in %.4f' % (time.clock() - ti))
    ti = time.clock()

    # Sparse representation of the Master Matrix (return value is not used
    # here; the call is kept as in the original flow).
    psuedo_sparse_matrix.psuedo_sparse_matrix(db, populated_matrix, 0)
    print('Psuedo Sparse Representation of the Frequency Matrix created in %.4f'
          % (time.clock() - ti))
    ti = time.clock()

    # Creating Index Matrix (return value is likewise not used here).
    psuedo_sparse_matrix.generate_index_matrix(db, 0)
    print('Index matrix created in %.4f' % (time.clock() - ti))

    # creating synthetic_population tables in MySQL
    drawing_households.create_synthetic_attribute_tables(db)

    # Total PUMS Sample x composite_type adjustment for hhld, gq and person
    for entity in entity_types:
        adjusting_pums_joint_distribution.create_joint_dist(
            db, entity, control_variables[entity], dimensions[entity],
            0, 0, 0)