def load(source, s3=False):
    """
    Load program data from a local or S3 CSV file into Program records.

    For a local file, 'source' should be a CSV file path.
    For an s3 file, 'source' should be the file name of a CSV
    in the 'validated_program_data' folder on s3.

    Each row is cleaned and validated with ProgramSerializer. Valid rows
    create or update the Program matching the row's school and program
    code; rows with an invalid program code, or whose school lookup
    returns an error, are printed and skipped.

    Returns a tuple (failed, endmsg): 'failed' is a list of error
    messages, one per invalid field of each row that failed validation,
    and 'endmsg' reports how many programs were created and updated.
    If no data could be read, returns a one-item error list and an
    empty string.
    """
    new_programs = 0
    updated_programs = 0
    failed = []  # validation failure messages
    if s3:
        s3_url = ('https://files.consumerfinance.gov'
                  '/pb/paying_for_college/csv/validated_program_data/{}')
        raw_data = read_in_s3(s3_url.format(source))
    else:
        raw_data = read_in_data(source)
    # Check for an empty result as well as an empty first row, so an
    # unreadable source yields the error tuple instead of an IndexError.
    if not raw_data or not raw_data[0]:
        return (["ERROR: could not read data from {0}".format(source)], "")

    # enumerate gives the true 1-based row position; looking the row up
    # with list.index() is O(n) and misreports duplicate rows.
    for row_number, row in enumerate(raw_data, start=1):
        # Evaluate the flag per row so one test row does not mark every
        # following row as a test program. A missing or empty cell
        # counts as "not a test".
        test_program = (row.get('test') or '').lower() == 'true'

        fixed_data = clean(row)
        serializer = ProgramSerializer(data=fixed_data)
        if not serializer.is_valid():
            # One message per invalid field, listing all of its errors.
            for key, error_list in serializer.errors.items():
                prefix = 'ERROR on row {}: {}: '.format(row_number, key)
                details = ''.join(' {},'.format(e) for e in error_list)
                failed.append(prefix + details)
            continue

        data = serializer.validated_data
        if not validate_pid(data['program_code']):
            print("ERROR: invalid program code: "
                  "{}".format(data['program_code']))
            continue
        (school, error) = get_school(data['ipeds_unit_id'])
        if error:
            print(error)
            continue

        program, created = Program.objects.get_or_create(
            institution=school,
            program_code=data['program_code'])
        if created:
            new_programs += 1
        else:
            updated_programs += 1

        program.accreditor = data['accreditor']
        program.cip_code = data['cip_code']
        program.completion_rate = data['completion_rate']
        program.default_rate = data['default_rate']
        program.mean_student_loan_completers = data[
            'mean_student_loan_completers']
        program.median_student_loan_completers = data[
            'median_student_loan_completers']
        program.program_code = data['program_code']
        program.program_name = strip_control_chars(data['program_name'])
        program.program_length = data['program_length']
        # soc_codes is intentionally not loaded (disabled in the original).
        program.total_cost = data['total_cost']
        program.campus = strip_control_chars(data['campus_name'])
        program.level = data['program_level']
        program.time_to_complete = data['average_time_to_complete']
        program.salary = data['median_salary']
        program.job_rate = data['job_placement_rate']
        program.job_note = data['job_placement_note']
        program.tuition = data['tuition_fees']
        program.books = data['books_supplies']
        program.completers = data['completers']
        program.completion_cohort = data['completion_cohort']
        program.test = test_program
        program.save()

    endmsg = ('{} programs created. '
              '{} programs updated.'.format(new_programs, updated_programs))
    return (failed, endmsg)
def test_validate_pid(self):
    """validate_pid should reject the bad sample codes and accept '108b'."""
    # Original note: bad_chars = [';', '<', '>', '{', '}']
    rejected_codes = ('490<script>', '{value}', 'DROP TABLE;')
    for code in rejected_codes:
        self.assertFalse(validate_pid(code))
    self.assertTrue(validate_pid('108b'))
def test_validate_pid(self):
    """Check validate_pid against three bad program codes and one good one."""
    # Original note: bad_chars = [';', '<', '>', '{', '}']
    bad_samples = ["490<script>", "{value}", "DROP TABLE;"]
    good_sample = "108b"
    for sample in bad_samples:
        is_valid = validate_pid(sample)
        self.assertFalse(is_valid)
    self.assertTrue(validate_pid(good_sample))
def load(source, s3=False):
    """
    Load program data from a local or S3 CSV file into Program records.

    For a local file, 'source' should be a CSV file path.
    For an s3 file, 'source' should be the file name of a CSV
    in the 'validated_program_data' folder on s3.

    Each row is cleaned and validated with ProgramSerializer. Valid rows
    create or update the Program matching the row's school and program
    code; rows with an invalid program code, or whose school lookup
    returns an error, are printed and skipped.

    Returns a tuple (failed, endmsg): 'failed' is a list of error
    messages, one per invalid field of each row that failed validation,
    and 'endmsg' reports how many programs were created and updated.
    If no data could be read, returns a one-item error list and an
    empty string.
    """
    new_programs = 0
    updated_programs = 0
    failed = []  # validation failure messages
    if s3:
        # Fetch over HTTPS rather than plain HTTP, since the rows are
        # written to the database.
        # NOTE(review): assumes this host serves the same path as the
        # old s3.amazonaws.com URL -- confirm.
        s3_url = ('https://files.consumerfinance.gov'
                  '/pb/paying_for_college/csv/validated_program_data/{}')
        raw_data = read_in_s3(s3_url.format(source))
    else:
        raw_data = read_in_data(source)
    # Check for an empty result as well as an empty first row, so an
    # unreadable source yields the error tuple instead of an IndexError.
    if not raw_data or not raw_data[0]:
        return (["ERROR: could not read data from {0}".format(source)], "")

    # enumerate gives the true 1-based row position; looking the row up
    # with list.index() is O(n) and misreports duplicate rows.
    for row_number, row in enumerate(raw_data, start=1):
        # Evaluate the flag per row so one test row does not mark every
        # following row as a test program. A missing or empty cell
        # counts as "not a test".
        test_program = (row.get('test') or '').lower() == 'true'

        fixed_data = clean(row)
        serializer = ProgramSerializer(data=fixed_data)
        if not serializer.is_valid():
            # .items() works on Python 2 and 3; .iteritems() is gone in 3.
            for key, error_list in serializer.errors.items():
                prefix = 'ERROR on row {}: {}: '.format(row_number, key)
                details = ''.join(' {},'.format(e) for e in error_list)
                failed.append(prefix + details)
            continue

        data = serializer.validated_data
        if not validate_pid(data['program_code']):
            print("ERROR: invalid program code: "
                  "{}".format(data['program_code']))
            continue
        (school, error) = get_school(data['ipeds_unit_id'])
        if error:
            print(error)
            continue

        program, created = Program.objects.get_or_create(
            institution=school,
            program_code=data['program_code']
        )
        if created:
            new_programs += 1
        else:
            updated_programs += 1

        program.accreditor = data['accreditor']
        program.cip_code = data['cip_code']
        program.completion_rate = data['completion_rate']
        program.default_rate = data['default_rate']
        program.mean_student_loan_completers = data[
            'mean_student_loan_completers']
        program.median_student_loan_completers = data[
            'median_student_loan_completers']
        program.program_code = data['program_code']
        program.program_name = strip_control_chars(data['program_name'])
        program.program_length = data['program_length']
        # soc_codes is intentionally not loaded (disabled in the original).
        program.total_cost = data['total_cost']
        program.campus = strip_control_chars(data['campus_name'])
        program.level = data['program_level']
        program.time_to_complete = data['average_time_to_complete']
        program.salary = data['median_salary']
        program.job_rate = data['job_placement_rate']
        program.job_note = data['job_placement_note']
        program.tuition = data['tuition_fees']
        program.books = data['books_supplies']
        program.completers = data['completers']
        program.completion_cohort = data['completion_cohort']
        program.test = test_program
        program.save()

    endmsg = ('{} programs created. '
              '{} programs updated.'.format(new_programs, updated_programs))
    return (failed, endmsg)