def add_locations_not_conflicting_with_user_added_locations(db):
    locations_data = load_seed_data_from('location')
    location_location_tags = load_seed_data_from('location_location_tag')
    json_location_ids = [loc['id'] for loc in locations_data]
    locations_in_db = run_query(db, 'select id from location')
    locations_in_db = [loc['id'] for loc in locations_in_db]
    # Convert list to set for more efficiency.
    locations_in_db = set(locations_in_db)
    locations_to_add = [id for id in json_location_ids if id not in locations_in_db]
    if len(locations_to_add) > 0:
        cursor = db.cursor()
        insert_sql = 'insert into location('
        for field in locations_data[0].keys():
            insert_sql += '`' + field + '`,'
        insert_sql = insert_sql[:-1] + ') values('
        for field in locations_data[0].keys():
            insert_sql += '%s,'
        insert_sql = insert_sql[:-1] + ')'
        location_tag_insert_sql = 'insert into location_location_tag(id, location_id, location_tag_id) values(%s, %s, %s)'
        for location in locations_to_add:
            print 'Adding location ' + str(location)
            # find location by id.
            location = [loc for loc in locations_data if loc['id'] == location][0]
            cursor.execute(insert_sql, location.values())
            location_tags = [loct for loct in location_location_tags if loct['location_id'] == location['id']]
            for location_tag in location_tags:
                new_guid = str(uuid.uuid4())
                cursor.execute(location_tag_insert_sql, (new_guid, location_tag['location_id'], location_tag['location_tag_id']))
        db.commit()
def add_locations_not_conflicting_with_user_added_locations(db):
    locations_data = load_seed_data_from('location')
    location_location_tags = load_seed_data_from('location_location_tag')
    json_location_ids = [loc['id'] for loc in locations_data]
    locations_in_db = run_query(db, 'select id from location')
    locations_in_db = [loc['id'] for loc in locations_in_db]
    locations_to_add = [id for id in json_location_ids if id not in locations_in_db]
    if len(locations_to_add) > 0:
        cursor = db.cursor()
        insert_sql = 'insert into location('
        for field in locations_data[0].keys():
            insert_sql += '`' + field + '`,'
        insert_sql = insert_sql[:-1] + ') values('
        for field in locations_data[0].keys():
            insert_sql += '%s,'
        insert_sql = insert_sql[:-1] + ')'
        location_tag_insert_sql = 'insert into location_location_tag(location_id, location_tag_id) values(%s, %s)'
        for location in locations_to_add:
            print 'Adding location ' + str(location)
            # find location by id.
            location = [loc for loc in locations_data if loc['id'] == location][0]
            cursor.execute(insert_sql, location.values())
            location_tags = [loct for loct in location_location_tags if loct['location_id'] == location['id']]
            for location_tag in location_tags:
                cursor.execute(location_tag_insert_sql, (location_tag['location_id'], location_tag['location_tag_id']))
        db.commit()
def test_location_references(self):
    user_answers = seed_io.load_seed_data_from('user_answer')
    location_location_tags = seed_io.load_seed_data_from('location_location_tag')
    review_comments = seed_io.load_seed_data_from('review_comment')
    locations = seed_io.load_seed_data_from('location')
    location_ids = get_key_from_list(locations, 'id')
    self.check_foreign_id(user_answers, 'location_id', location_ids, 'user_answer')
    self.check_foreign_id(location_location_tags, 'location_id', location_ids, 'location_location_tag')
    self.check_foreign_id(review_comments, 'location_id', location_ids, 'review_comment')
def test_location_references(self):
    user_answers = seed_io.load_seed_data_from('user_answer')
    location_location_tags = seed_io.load_seed_data_from(
        'location_location_tag')
    review_comments = seed_io.load_seed_data_from('review_comment')
    locations = seed_io.load_seed_data_from('location')
    location_ids = get_key_from_list(locations, 'id')
    self.check_foreign_id(user_answers, 'location_id', location_ids,
                          'user_answer')
    self.check_foreign_id(location_location_tags, 'location_id', location_ids,
                          'location_location_tag')
    self.check_foreign_id(review_comments, 'location_id', location_ids,
                          'review_comment')
def nullify_ratings_cache_when_missing_questions(db):
    """ Sets ratings_cache to null when not all questions are specified in the ratings_cache value.
        This is a little sanitization of the database.
        Aspects of the rating calculation in the web application assume that if
        ratings_cache is not null, it must set values for every question.
    """
    questions = load_seed_data_from('question')
    questions = [q['id'] for q in questions]
    locations_with_ratings_cache = run_query(db, 'select id, ratings_cache from location where ratings_cache is not null')
    location_ids_to_clear = []
    for location in locations_with_ratings_cache:
        ratings = json.loads(location['ratings_cache'])
        for question_id in questions:
            if question_id not in ratings:
                location_ids_to_clear.append(location['id'])
                break
    if len(location_ids_to_clear) > 0:
        print('Clearing ratings_cache for %d locations.' % len(location_ids_to_clear))
        group_size = 100
        cursor = db.cursor()
        # Loop through groups.
        # We don't want to delete all at once because there may be a limit to the SQL query size.
        while (len(location_ids_to_clear) > 0):
            group_ids = location_ids_to_clear[0:group_size]
            s = str(group_ids).replace('[', '(').replace(']', ')').replace('u', '')
            s = 'update location set ratings_cache=NULL where id in ' + s
            run_query(db, s)
            # Remove the elements that were already updated.
            location_ids_to_clear = location_ids_to_clear[len(group_ids):]
        db.commit()
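# The function above assumes ratings_cache is a JSON object keyed by question id;
# a location whose cache lacks any current question id gets its ratings_cache set
# to NULL. Illustrative sketch only: the real column may carry additional
# per-question data, and the key format depends on the question seed data.
example_ratings_cache = '{"question-id-1": 4.5, "question-id-2": 3.0}'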
def safely_remove_removed_locations(db):
    locations_data = load_seed_data_from('location')
    json_location_ids = [loc['id'] for loc in locations_data]
    locations_with_answers = run_query(db, 'select distinct location_id from user_answer union distinct select distinct location_id from review_comment')
    locations_with_answers = [loc['location_id'] for loc in locations_with_answers]
    locations_in_db = run_query(db, 'select id from location where creator_user_id is null')
    # Convert list to set for more efficiency.
    locations_with_answers = set(locations_with_answers)
    json_location_ids = set(json_location_ids)
    locations_safe_to_delete = [loc['id'] for loc in locations_in_db if loc['id'] not in locations_with_answers]
    locations_to_delete = [id for id in locations_safe_to_delete if id not in json_location_ids]
    if len(locations_to_delete) > 0:
        id_list = '('
        for location_id in locations_to_delete:
            id_list += '%s, '
        id_list = id_list[:-2]  # remove trailing comma.
        id_list += ')'
        delete_location_location_tag_sql = 'delete from location_location_tag where location_id in ' + id_list
        stringified_ids = [str(id) for id in locations_to_delete]
        print 'removing locations: ' + (', '.join(stringified_ids))
        cursor = db.cursor()
        cursor.execute(delete_location_location_tag_sql, locations_to_delete)
        delete_location_sql = 'delete from location where id in ' + id_list
        cursor.execute(delete_location_sql, locations_to_delete)
        db.commit()
def set_fields_on_location_tags(db):
    print 'setting fields on location_tags table'
    location_tags_data = load_seed_data_from('location_tag')
    cursor = db.cursor()
    for location_tag in location_tags_data:
        update_sql = 'update location_tag set description=%s, icon_selector=%s where id=%s'
        cursor.execute(update_sql, (location_tag['description'], location_tag['icon_selector'], location_tag['id']))
def safely_remove_removed_locations(db):
    locations_data = load_seed_data_from('location')
    json_location_ids = [loc['id'] for loc in locations_data]
    locations_with_answers = run_query(db, 'select distinct location_id from user_answer union distinct select distinct location_id from review_comment')
    locations_with_answers = [loc['location_id'] for loc in locations_with_answers]
    locations_in_db = run_query(db, 'select id from location where creator_user_id is null')
    locations_safe_to_delete = [loc['id'] for loc in locations_in_db if loc['id'] not in locations_with_answers]
    locations_to_delete = [id for id in locations_safe_to_delete if id not in json_location_ids]
    if len(locations_to_delete) > 0:
        id_list = '('
        for location_id in locations_to_delete:
            id_list += '%s, '
        id_list = id_list[:-2]  # remove trailing comma.
        id_list += ')'
        delete_location_location_tag_sql = 'delete from location_location_tag where location_id in ' + id_list
        stringified_ids = [str(id) for id in locations_to_delete]
        print 'removing locations: ' + (', '.join(stringified_ids))
        cursor = db.cursor()
        cursor.execute(delete_location_location_tag_sql, locations_to_delete)
        delete_location_sql = 'delete from location where id in ' + id_list
        cursor.execute(delete_location_sql, locations_to_delete)
        db.commit()
def test_locations_near(self):
    locations = seed_io.load_seed_data_from('location')
    container = LocationContainer(locations)
    self.assertFalse(container.is_empty())
    # smoke test a couple methods.
    windsor = { 'id': '123', 'latitude': 42.3, 'longitude': -83 }
    index = container.get_bucket_index_from_latitude(windsor['latitude'])
    self.assertTrue(isinstance(index, int))
    results = list(container.locations_near(windsor['longitude'], windsor['latitude'], 0.5))
    self.assertTrue(len(results) > 0)
    container.insert(windsor)
    new_results = list(container.locations_near(windsor['longitude'], windsor['latitude'], 0.5))
    self.assertTrue(len(new_results) == len(results) + 1)
    # Test that get_location_by_id works.
    windsor_lookup = container.get_location_by_id('123')
    self.assertIsNotNone(windsor_lookup)
    self.assertEqual(windsor['latitude'], windsor_lookup['latitude'])
    self.assertEqual(windsor['longitude'], windsor_lookup['longitude'])
    self.assertFalse(container.is_empty())
def test_locations_near(self):
    locations = seed_io.load_seed_data_from('location')
    container = LocationContainer(locations)
    self.assertFalse(container.is_empty())
    # smoke test a couple methods.
    windsor = {'id': '123', 'latitude': 42.3, 'longitude': -83}
    index = container.get_bucket_index_from_latitude(windsor['latitude'])
    self.assertTrue(isinstance(index, int))
    results = list(
        container.locations_near(windsor['longitude'], windsor['latitude'], 0.5))
    self.assertTrue(len(results) > 0)
    container.insert(windsor)
    new_results = list(
        container.locations_near(windsor['longitude'], windsor['latitude'], 0.5))
    self.assertTrue(len(new_results) == len(results) + 1)
    # Test that get_location_by_id works.
    windsor_lookup = container.get_location_by_id('123')
    self.assertIsNotNone(windsor_lookup)
    self.assertEqual(windsor['latitude'], windsor_lookup['latitude'])
    self.assertEqual(windsor['longitude'], windsor_lookup['longitude'])
    self.assertFalse(container.is_empty())
def add_missing_data(db, table_names):
    cursor = db.cursor()
    for table_name in table_names:
        json_data = load_seed_data_from(table_name)
        db_data = run_query(db, 'select id from ' + table_name)
        db_data = [row['id'] for row in db_data]
        new_data = [
            new_row for new_row in json_data if new_row['id'] not in db_data
        ]
        if len(new_data) > 0:
            base_insert_sql = 'insert into `' + table_name + '`('
            for field_name in new_data[0].keys():
                base_insert_sql += '`' + field_name + '`,'
            base_insert_sql = base_insert_sql[:-1] + ') values('  # remove trailing comma.
            for new_row in new_data:
                values = []
                insert_sql = base_insert_sql
                for field_name in new_row.keys():
                    values.append(new_row[field_name])
                    insert_sql += '%s,'
                insert_sql = insert_sql[:-1] + ')'  # remove trailing comma.
                print 'Table ' + table_name + ' Inserting ' + str(new_row['id'])
                print 'SQL: ' + insert_sql
                print 'values: ' + str(values)
                cursor.execute(insert_sql, values)
    db.commit()
def replace_all_data(db, table_names):
    cursor = db.cursor()
    for table_name in table_names:
        run_query(db, 'delete from `' + table_name + '`')
        table_data = load_seed_data_from(table_name)
        for table_record_data in table_data:
            insert(cursor, table_name, table_record_data)
    db.commit()
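# replace_all_data (and the add_missing_data variants below) call a generic
# insert(cursor, table_name, record) helper that is not part of this excerpt.
# A minimal sketch of such a helper, assuming each record is a dict keyed by
# column name; the real helper in import_helpers may differ.
def insert(cursor, table_name, record):
    columns = list(record.keys())
    column_sql = ', '.join('`' + column + '`' for column in columns)
    placeholders = ', '.join(['%s'] * len(columns))
    sql = 'insert into `%s`(%s) values(%s)' % (table_name, column_sql, placeholders)
    cursor.execute(sql, [record[column] for column in columns])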
def set_fields_on_locations(db):
    locations_data = load_seed_data_from('location')
    for location in locations_data:
        if location['external_web_url'] and len(location['external_web_url']) > 255:
            print 'external_web_url for location ' + str(location['id']) + ' is too long at ' + str(len(location['external_web_url'])) + '.'
            return
    # We're only concerned with locations that have either address, phone number,
    # external_web_url, location_group_id or any combination so let's filter out the useless data.
    # This may boost efficiency of the m*n time loop below by reducing m considerably.
    locations_data = [
        location for location in locations_data
        if location['address'] or location['phone_number'] or
        location['external_web_url'] or location['location_group_id']
    ]
    fields = [
        'address', 'phone_number', 'external_web_url', 'location_group_id',
        'destroy_location_event_id'
    ]
    location_query = 'select * from location where 0'
    for field in fields:
        location_query += ' or %s is null or %s=\'\'' % (field, field)
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    cur.execute(location_query)
    db_data = [row for row in cur.fetchall()]
    locations_data = utils.list_to_dict(locations_data)
    print 'May update up to ' + str(len(db_data)) + ' records'
    cursor = db.cursor()
    for db_location in db_data:
        location = None
        if db_location['id'] in locations_data:
            location = locations_data[db_location['id']]
        if location:
            fields_to_set = []
            field_values = []
            for field in fields:
                if location[field] and not db_location[field]:
                    fields_to_set.append(field)
                    field_values.append(location[field])
            if len(field_values) > 0:
                update_sql = 'update location set '
                for field in fields_to_set:
                    update_sql += field + '=%s,'
                update_sql = update_sql[:-1]  # remove trailing comma.
                update_sql += ' where id=\'' + str(location['id']) + '\''
                print 'running: ' + update_sql
                cursor.execute(update_sql, field_values)
    db.commit()
def test_no_duplicate_location_location_tags(self):
    location_location_tags = seed_io.load_seed_data_from(
        'location_location_tag')
    location_id_tag_ids = [
        llt['location_id'] + '-' + str(llt['location_tag_id'])
        for llt in location_location_tags
    ]
    duplicates = self.find_duplicates(location_id_tag_ids)
    self.assertEqual(len(duplicates), 0,
                     'duplicates found: ' + str(duplicates))
def set_fields_on_questions(db):
    cursor = db.cursor()
    questions_data = load_seed_data_from('question')
    # Update order to prevent unique constraint violations
    # as order is updated in the following loop.
    cursor = db.cursor()
    for question_data in questions_data:
        update_sql = 'update question set question_html=%s, is_always_required=%s, `order`=%s, explanation=%s, is_required_config=%s, name=%s where id=%s'
        cursor.execute(update_sql, (question_data['question_html'],
                                    question_data['is_always_required'],
                                    question_data['order'],
                                    question_data['explanation'],
                                    question_data['is_required_config'],
                                    question_data['name'],
                                    question_data['id']))
def add_missing_data(db, table_names):
    cursor = db.cursor()
    for table_name in table_names:
        json_data = load_seed_data_from(table_name)
        db_data = run_query(db, 'select id from ' + table_name)
        db_data = [row['id'] for row in db_data]
        new_data = [new_row for new_row in json_data if new_row['id'] not in db_data]
        for new_row in new_data:
            insert(cursor, table_name, new_row)
    db.commit()
def update_coordinates_for_locations(db):
    location_ids = ['00000000-0000-0000-0000-000000000020']
    locations_data = load_seed_data_from('location')
    cursor = db.cursor(MySQLdb.cursors.DictCursor)
    for location_id in location_ids:
        matching_location = [loc for loc in locations_data if loc['id'] == location_id][0]
        location_statement = ('update location set latitude=%s,longitude=%s where id=\'%s\'' %
                              (matching_location['latitude'], matching_location['longitude'], location_id))
        cursor.execute(location_statement)
    db.commit()
def set_fields_on_questions(db):
    cursor = db.cursor()
    questions_data = load_seed_data_from('question')
    # Update order to prevent unique constraint violations
    # as order is updated in the following loop.
    cursor = db.cursor()
    for question_data in questions_data:
        update_sql = 'update question set question_html=%s, is_always_required=%s, `order`=%s, explanation=%s where id=%s'
        cursor.execute(update_sql, (question_data['question_html'],
                                    question_data['is_always_required'],
                                    question_data['order'],
                                    question_data['explanation'],
                                    question_data['id']))
def add_missing_data_with_composite_keys(db, cursor, table_name, composite_keys):
    json_data = load_seed_data_from(table_name)
    sql = 'select '
    for key in composite_keys:
        sql += '`' + key + '`,'
    sql = sql[:-1] + ' from `' + table_name + '`'
    db_data = run_query(db, sql)
    db_data = [get_hash_string(row.values()) for row in db_data]
    db_data = set(db_data)  # The "in" operator works more efficiently on sets.
    new_data = [new_row for new_row in json_data
                if get_hash_string(filter_keys(new_row, composite_keys).values()) not in db_data]
    for new_row in new_data:
        insert(cursor, table_name, new_row)
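# add_missing_data_with_composite_keys relies on filter_keys and get_hash_string,
# which are not included in this excerpt. Plausible sketches only: filter_keys is
# assumed to keep just the composite-key columns, and get_hash_string to build a
# stable string from a sequence of values so seed rows and database rows compare
# equal regardless of dict ordering; the real helpers may differ.
def filter_keys(row, keys):
    # Keep only the named columns of a row.
    return {key: row[key] for key in keys}

def get_hash_string(values):
    # Sort so that dict.values() ordering does not affect the comparison.
    return '|'.join(sorted(str(value) for value in values))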
def add_missing_data(db, table_names):
    cursor = db.cursor()
    for table_name in table_names:
        if isinstance(table_name, dict):
            add_missing_data_with_composite_keys(db, cursor, table_name['name'], table_name['composite_keys'])
        else:
            json_data = load_seed_data_from(table_name)
            db_data = run_query(db, 'select id from ' + table_name)
            db_data = [row['id'] for row in db_data]
            db_data = set(db_data)  # The "in" operator works more efficiently on sets.
            new_data = [new_row for new_row in json_data if new_row['id'] not in db_data]
            for new_row in new_data:
                insert(cursor, table_name, new_row)
    db.commit()
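# Example call for this version of add_missing_data, which accepts plain table
# names alongside dicts describing tables keyed by composite keys. Hedged: `db`
# is assumed to be an open MySQLdb connection and the actual call sites are not
# shown in this excerpt; the composite-key spec mirrors the location_location_tag
# table used elsewhere in these scripts.
add_missing_data(db, [
    'question',
    'location_tag',
    {
        'name': 'location_location_tag',
        'composite_keys': ['location_id', 'location_tag_id'],
    },
])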
def set_fields_on_locations(db):
    locations_data = load_seed_data_from('location')
    for location in locations_data:
        if location['external_web_url'] and len(location['external_web_url']) > 255:
            print 'external_web_url for location ' + str(location['id']) + ' is too long at ' + str(len(location['external_web_url'])) + '.'
            return
    # We're only concerned with locations that have either address, phone number,
    # external_web_url, location_group_id or any combination so let's filter out the useless data.
    # This may boost efficiency of the m*n time loop below by reducing m considerably.
    locations_data = [location for location in locations_data
                      if location['address'] or location['phone_number'] or
                      location['external_web_url'] or location['location_group_id']]
    fields = ['address', 'phone_number', 'external_web_url', 'location_group_id', 'destroy_location_event_id']
    location_query = 'select * from location where 0'
    for field in fields:
        location_query += ' or %s is null or %s=\'\'' % (field, field)
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    cur.execute(location_query)
    db_data = [row for row in cur.fetchall()]
    locations_data = utils.list_to_dict(locations_data)
    print 'May update up to ' + str(len(db_data)) + ' records'
    cursor = db.cursor()
    for db_location in db_data:
        location = None
        if db_location['id'] in locations_data:
            location = locations_data[db_location['id']]
        if location:
            fields_to_set = []
            field_values = []
            for field in fields:
                if location[field] and not db_location[field]:
                    fields_to_set.append(field)
                    field_values.append(location[field])
            if len(field_values) > 0:
                update_sql = 'update location set '
                for field in fields_to_set:
                    update_sql += field + '=%s,'
                update_sql = update_sql[:-1]  # remove trailing comma.
                update_sql += ' where id=\'' + str(location['id']) + '\''
                print 'running: ' + update_sql
                cursor.execute(update_sql, field_values)
    db.commit()
def set_fields_on_locations(db):
    locations_data = load_seed_data_from('location')
    # We're only concerned with locations that have either address, phone number or both so
    # let's filter out the useless data.
    # This may boost efficiency of the m*n time loop below by reducing m considerably.
    locations_data = [
        location for location in locations_data
        if location['address'] or location['phone_number']
    ]
    fields = ['address', 'phone_number', 'external_web_url']
    location_query = 'select * from location where 0'
    for field in fields:
        location_query += ' or %s is null or %s=\'\'' % (field, field)
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    cur.execute(location_query)
    db_data = [row for row in cur.fetchall()]
    print 'May update up to ' + str(len(db_data)) + ' records'
    cursor = db.cursor()
    for db_location in db_data:
        location = find_match('location', locations_data, db_location)
        if location:
            fields_to_set = []
            field_values = []
            for field in fields:
                if location[field] and not db_location[field]:
                    fields_to_set.append(field)
                    field_values.append(location[field])
            if len(field_values) > 0:
                update_sql = 'update location set '
                for field in fields_to_set:
                    update_sql += field + '=%s,'
                update_sql = update_sql[:-1]  # remove trailing comma.
                update_sql += ' where id=' + str(location['id'])
                print 'running: ' + update_sql
                cursor.execute(update_sql, field_values)
    db.commit()
or:
    python csv_importer.py locations.csv

The import configuration file describes the meaning of each column by relating them
to either fields of the location table or names from the location_tag table.
"""
from import_helpers.task_loader import get_task_info
import csv
import sys, errno
import import_helpers.seed_io as seed_io
from import_helpers.merging import merge_location

task = get_task_info()
import_config = task['import_config']
locations = seed_io.load_seed_data_from('location')
location_tags = seed_io.load_seed_data_from('location_tag')
location_location_tags = seed_io.load_seed_data_from('location_location_tag')
location_duplicates = seed_io.load_seed_data_from('location_duplicate')

print('loaded ' + str(len(location_duplicates)) + ' location duplicates')

with open(task['csv_filename']) as csv_file:
    if import_config['is_first_row_titles']:
        csv_file.readline()  # skip the column titles row.
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    num_values = len(import_config['columns'])
    # loop through lines of the file.
    for values in csv_reader:
        if len(values) != num_values:
or:
    python csv_importer.py locations.csv

The import configuration file describes the meaning of each column by relating them
to either fields of the location table or names from the location_tag table.
"""
from import_helpers.task_loader import get_task_info
import csv
import sys, errno
import import_helpers.seed_io as seed_io
from import_helpers.merging import merge_location

task = get_task_info()
import_config = task['import_config']
locations = seed_io.load_seed_data_from('location')
location_tags = seed_io.load_seed_data_from('location_tag')
location_location_tags = seed_io.load_seed_data_from('location_location_tag')
location_duplicates = seed_io.load_seed_data_from('location_duplicate')
user_answers = seed_io.load_seed_data_from('user_answer')

print('loaded ' + str(len(location_duplicates)) + ' location duplicates')

with open(task['csv_filename']) as csv_file:
    if import_config['is_first_row_titles']:
        csv_file.readline()  # skip the column titles row.
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    num_values = len(import_config['columns'])
    # loop through lines of the file.
    for values in csv_reader:
import import_helpers.seed_io as seed_io
import import_helpers.location_name_sanitizer as location_name_sanitizer

locations = seed_io.load_seed_data_from('location')
for location in locations:
    location['name'] = location_name_sanitizer.sanitize_name(location['name'])
seed_io.write_seed_data('location', locations)
The import configuration file describes the meaning of each column by relating them
to either fields of the location table or names from the location_tag table.
"""
from import_helpers.task_loader import get_task_info
import csv
import sys, errno
import import_helpers.seed_io as seed_io
from import_helpers.merging import merge_location
from import_helpers.location_container import LocationContainer
from import_helpers.location_duplicate_container import LocationDuplicateContainer

task = get_task_info()
import_config = task['import_config']
locations = seed_io.load_seed_data_from('location')
location_tags = seed_io.load_seed_data_from('location_tag')
location_location_tags = seed_io.load_seed_data_from('location_location_tag')
location_duplicates = seed_io.load_seed_data_from('location_duplicate')
location_groups = seed_io.load_seed_data_from('location_group')
user_answers = seed_io.load_seed_data_from('user_answer')
locations = LocationContainer(locations)
location_duplicates = LocationDuplicateContainer(location_duplicates)

with open(task['csv_filename']) as csv_file:
    if import_config['is_first_row_titles']:
        csv_file.readline()  # skip the column titles row.
    csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    num_values = len(import_config['columns'])
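# The importer reads its column mapping from task['import_config']. The code
# above only shows that the config has an is_first_row_titles flag and a
# columns list, so the structure below is an illustrative assumption, not the
# real schema: each column is mapped either to a location field or to a
# location_tag name, as described in the docstring.
example_import_config = {
    'is_first_row_titles': True,
    'columns': [
        {'location_field': 'name'},
        {'location_field': 'address'},
        {'location_tag_name': 'restaurant'},
    ],
}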
def test_simplify_name_smoke_test_on_all_location_names(self):
    locations = seed_io.load_seed_data_from('location')
    for location in locations:
        duplicate_detection.simplify_name(location['name'])
def test_no_duplicate_location_location_tags(self):
    location_location_tags = seed_io.load_seed_data_from('location_location_tag')
    location_id_tag_ids = [llt['location_id'] + '-' + str(llt['location_tag_id'])
                           for llt in location_location_tags]
    duplicates = self.find_duplicates(location_id_tag_ids)
    self.assertEqual(len(duplicates), 0, 'duplicates found: ' + str(duplicates))
def test_get_location_duplicates_by_name(self):
    duplicate_locations = LocationDuplicateContainer([])
    self.assertTrue(isinstance(duplicate_locations.get_location_duplicates_by_name('bla'), list))
    duplicate_locations = LocationDuplicateContainer(seed_io.load_seed_data_from('location_duplicate'))
    self.assertTrue(isinstance(duplicate_locations.get_location_duplicates_by_name('bla'), list))