continue rdict['source'] = source # No longer storing these del rdict['old_combined_id'] del rdict['old_source_id'] try: print i, rdict['place_origin'].decode(migtools.STRING_ENCODING), u", ", rdict['large1'].decode(migtools.STRING_ENCODING), u", ", rdict['large2'].decode(migtools.STRING_ENCODING), u", ", rdict['large3'].decode(migtools.STRING_ENCODING) except UnicodeEncodeError: # Windows decode error workaround print i, "<UnicodeEncodeError Encountered, ignoring for now>" try: rdict['location'] = migtools.get_or_add_location(unicode(rdict['place_origin'], migtools.STRING_ENCODING), mig_user, unicode(rdict['large1'], migtools.STRING_ENCODING), unicode(rdict['large2'], migtools.STRING_ENCODING), unicode(rdict['large3'], migtools.STRING_ENCODING)) except Location.DatabaseError as e: sys.stderr.write('Database error on getting or adding location in row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) num_err_rows += 1 continue except migtools.LocationTooComplicated as e: sys.stderr.write('Location too complicated in row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) num_err_rows += 1 continue del rdict['place_origin'] del rdict['large1'] del rdict['large2'] del rdict['large3']
def add_row(rdict, num_err_rows): if rdict['old_combined_id']: cid_matches = re.match(r'([^\.-]+)[\.-]([^\.-]+)', rdict['old_combined_id']) if not cid_matches: sys.stderr.write('Failed to match combined id %s in row (%i)\n' % (rdict['old_combined_id'], i)) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 source_id = cid_matches.group(1) table_id = cid_matches.group(2) #if source_id != rdict['old_source_id']: # sys.stderr.write('Mismatch of old source ID in row (%i)\n' % i) # sys.stderr.write('%s\n' % rdict) # return num_err_rows + 1 table = None try: table = Table.objects.get(old_id = table_id) except Table.DoesNotExist as e: sys.stderr.write('Source table does not exist in row (%i)\n' % i) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 #nr = cid_matches.group(3) #if not nr and table.nr != nr: # sys.stderr.write('Table NR mismatch in row (%i)\n' % i) # sys.stderr.write('%s\n' % rdict) # return num_err_rows + 1 rdict['source'] = table else: source = None try: Source.objects.get(old_id = rdict['old_source_id']) except Source.DoesNotExist as e: sys.stderr.write('Source does not exist in row (%i)\n' % i) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 rdict['source'] = source val_specified = False if rdict.has_key('individuals_population_value'): if len(rdict['individuals_population_value']) > 0 and rdict['individuals_population_value'] != 0: val_specified = True rdict['individ_fam'] = 0 rdict['population_value'] = rdict['individuals_population_value'] del rdict['individuals_population_value'] if rdict.has_key('families_population_value'): if len(rdict['families_population_value']) > 0 and rdict['families_population_value'] != 0: if val_specified: num_err_rows = add_row(rdict.copy(), num_err_rows) else: val_specified = True rdict['individ_fam'] = 1 rdict['population_value'] = rdict['families_population_value'] del rdict['families_population_value'] if rdict.has_key('male_population_value'): if len(rdict['male_population_value']) > 0 and 
rdict['male_population_value'] != 0: if val_specified: num_err_rows = add_row(rdict.copy(), num_err_rows) else: val_specified = True rdict['individ_fam'] = 0 rdict['population_value'] = rdict['male_population_value'] rdict['population_gender'] = 'm' del rdict['male_population_value'] if rdict.has_key('female_population_value'): if len(rdict['female_population_value']) > 0 and rdict['female_population_value'] != 0: if val_specified: num_err_rows = add_row(rdict.copy(), num_err_rows) else: val_specified = True rdict['individ_fam'] = 0 rdict['population_value'] = rdict['female_population_value'] rdict['population_gender'] = 'f' del rdict['female_population_value'] if not val_specified: #sys.stderr.write('Data entry with no data in row (%i)\n' % i) #sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 try: print i, rdict['place_origin'].decode(migtools.STRING_ENCODING), u", ", rdict['large1'].decode(migtools.STRING_ENCODING), u", ", rdict['large2'].decode(migtools.STRING_ENCODING), u", ", rdict['large3'].decode(migtools.STRING_ENCODING) except UnicodeEncodeError: # Windows decode error workaround print i, "<UnicodeEncodeError Encountered, ignoring for now>" try: rdict['location'] = migtools.get_or_add_location(unicode(rdict['place_origin'], migtools.STRING_ENCODING), mig_user, unicode(rdict['large1'], migtools.STRING_ENCODING), unicode(rdict['large2'], migtools.STRING_ENCODING), unicode(rdict['large3'], migtools.STRING_ENCODING)) except Location.DatabaseError as e: sys.stderr.write('Database error on getting or adding location in row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 except migtools.LocationTooComplicated as e: sys.stderr.write('Location too complicated in row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 #import pdb; pdb.set_trace() del rdict['place_origin'] del rdict['large1'] del rdict['large2'] del rdict['large3'] del rdict['link'] del rdict['place_english'] # No longer storing these 
del rdict['old_combined_id'] del rdict['old_source_id'] for k in rdict.keys(): if isinstance(rdict[k], basestring) and not rdict[k]: del rdict[k] for col_name, add_fun in { 'religion' : get_or_add_religion, 'race' : get_or_add_race, 'ethnicity' : get_or_add_ethnicity, 'ethnic_origin' : get_or_add_ethnic_origin, 'population_condition' : get_or_add_pop_cond }.iteritems(): if rdict.has_key(col_name): try: rdict[col_name] = add_fun(unicode(rdict[col_name], migtools.STRING_ENCODING)) except DatabaseError as e: sys.stderr.write("Error on get_or_add_%s in row (%i): %s\n" % (col_name, i, e)) sys.stderr.write("%s\n" % rdict) return num_err_rows + 1 if rdict.has_key('remarks'): rdict['remarks'] = rdict['remarks'].decode(migtools.STRING_ENCODING) if rdict.has_key('alternate_location_name'): rdict['alternate_location_name'] = rdict['alternate_location_name'].decode(migtools.STRING_ENCODING) try: if rdict.has_key('begin_date'): mon, day, year = [int(j) for j in rdict['begin_date'].split('/')] rdict['begin_date'] = datetime.date(year, mon, day) if rdict.has_key('end_date'): mon, day, year = [int(j) for j in rdict['end_date'].split('/')] rdict['end_date'] = datetime.date(year, mon, day) except ValueError as e: sys.stderr.write('Encountered error in date format at row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) return num_err_rows + 1 for age_col in ('age_start', 'age_end'): if rdict.has_key(age_col): if rdict[age_col] in ('Unknown', 'Age unkown', 'Death', 'death'): # Yes, the typo is in the data to migrate del rdict[age_col] elif rdict[age_col] in ('Under 1', 'Total', 'Total all ages', 'All ages'): del rdict['age_start'] if rdict.has_key('age_end'): del rdict['age_end'] break elif rdict[age_col] in ('Not specified','Unspecified', 'Period not indicated'): del rdict[age_col] else: over_match = re.match(r'Over\s(\d+)', rdict[age_col]) if over_match: if rdict.has_key('age_end'): del rdict['age_end'] rdict['age_start'] = over_match.group(1) break under_match = 
re.match(r'Under\s(\d+)', rdict[age_col]) if under_match: if rdict.has_key('age_start'): del rdict['age_start'] rdict['age_end'] = under_match.group(1) break total_range_match = re.match(r'Total,\s(\d+)-(\d+)', rdict[age_col]) if total_range_match: rdict['age_start'] = total_range_match.group(1) rdict['age_end'] = total_range_match.group(2) break rdict['active'] = True rdict['submitted_by'] = mig_user try: entry = MainDataEntry(**rdict) entry.save() except (ValueError, DatabaseError, ValidationError) as e: sys.stderr.write('Failed to save data row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) num_err_rows += 1 return num_err_rows
# Resolve the location of the current row.
# This is a fragment of a per-row loop whose header is not visible here (the
# `continue` statements below skip to the next row); `i`, `rdict`, `args` and
# `num_err_rows` are all defined by that enclosing code.

# The location lookup below works on 'place_english', which is taken from the
# row's 'original_location_name' column.
rdict['place_english'] = rdict['original_location_name']

# Progress line for this row.  Writing the decoded text can fail with
# UnicodeEncodeError on some consoles, in which case a placeholder is written.
try:
    sys.stdout.write("[%i] %s, %s, %s, %s\n" % (i, rdict['place_english'].decode(migtools.STRING_ENCODING), rdict['large1'].decode(migtools.STRING_ENCODING), rdict['large2'].decode(migtools.STRING_ENCODING), rdict['large3'].decode(migtools.STRING_ENCODING)))
except UnicodeEncodeError:
    # Windows decode error workaround
    sys.stdout.write("[%i] <UnicodeEncodeError Encountered, ignoring for now>\n" % i)

# Look the location up (or add it).  Each failure mode is reported on stderr
# with the row contents, counted in num_err_rows, and the row is skipped.
# NOTE(review): the doubled parentheses mean a single 5-tuple is passed as the
# only positional argument -- confirm this matches the signature of
# migtools.get_or_add_location.
# NOTE(review): args.allownewloc presumably controls whether unknown locations
# may be created -- confirm against migtools.
try:
    rdict['location'] = migtools.get_or_add_location((rdict['place_english'].decode(migtools.STRING_ENCODING), rdict['large1'].decode(migtools.STRING_ENCODING), rdict['large2'].decode(migtools.STRING_ENCODING), rdict['large3'].decode(migtools.STRING_ENCODING), args.allownewloc))
except DatabaseError as e:
    sys.stderr.write('[%i] Database error on getting or adding location: %s: %s\n' % (i, e, rdict))
    num_err_rows += 1
    continue
except migtools.LocationTooComplicated as e:
    sys.stderr.write('[%i] Location too complicated: %s: %s\n' % (i, e, rdict))
    num_err_rows += 1
    continue
except Location.DoesNotExist:
    # No matching location was found.
    sys.stderr.write("[%i] No existing location found to match: %s\n" % (i, rdict))
    num_err_rows += 1
    continue
if rdict['original_language'] and len(rdict['original_language']) != 0: languages = [get_or_add_language(lang, rdict['submitted_by']) for lang in re.split(r'\s*,\s*', rdict['original_language'])] del rdict['original_language'] subjects = None if rdict['subjects'] and len(rdict['subjects']) != 0: subjects = [get_or_add_subject(subject, rdict['submitted_by']) for subject in re.split('\s*,\s*', rdict['subjects'])] del rdict['subjects'] if rdict['included_countries'] and len(rdict['included_countries']) != 0: try: countries = [migtools.get_or_add_location(unicode(country, migtools.STRING_ENCODING), rdict['submitted_by']) for country in re.split('\s*,\s*', rdict['included_countries'])] except migtools.LocationTooComplicated as e: sys.stderr.write('Location too complicated in row (%i): %s\n' % (i, e)) sys.stderr.write('%s\n' % rdict) num_err_rows + 1 continue del rdict['included_countries'] if rdict['begin_year']: rdict['begin_year'] = re.match(r'\d+/\d+/(\d+)', rdict['begin_year']).group(1) else: del rdict['begin_year'] if rdict['end_year']: rdict['end_year'] = re.match(r'\d+/\d+/(\d+)', rdict['end_year']).group(1)