def main():
    """Build and upload the temporary Brazil course and student sheets.

    Loads the source data with ``get_data``, passes it to
    ``check_missing_unis``, then uploads the results of ``get_courses`` and
    ``get_students`` with ``gcat.put_file`` (``update=True``).
    """
    source_df = get_data()
    check_missing_unis(source_df)

    # Assumes the universities have already been added and assigned
    # university_ids.
    course_rows = get_courses(source_df)
    gcat.put_file('tmp_brazil_courses', course_rows, update=True)

    # Assumes the courses have already been added and assigned course_ids.
    student_rows = get_students(source_df)
    gcat.put_file('tmp_brazil_student', student_rows, update=True)
def main():
    """Prepare the new US/Canada student rows and upload them.

    Reads the ``wep_us_canada`` sheet (bypassing the cache), looks up course
    meta data and universities for the distinct course values, merges the
    original rows with their courses, prints the result and uploads it to
    ``tmp_us_canada_students_NEW``.
    """
    # Get the original data and clean it.
    # Historical note: the sheet was previously produced with
    #   orig_df = load_rtfs()
    #   gcat.put_file('wep_us_canada',df)
    source_df = gcat.get_file("wep_us_canada", fmt="pandas", usecache=False)

    # Get course meta data for the distinct values of the "course" column.
    distinct_courses = set(source_df["course"])
    course_meta = get_course_meta_data(distinct_courses)

    # Find universities. The result is not used below; the call is kept
    # because it may have side effects -- TODO confirm whether it is needed.
    # It used to feed the rows to be added to the course sheet:
    #   gcat.put_file('tmp_us_canada_courses', courses_final, update=True)
    get_universities(course_meta)

    # Merge the original data with the courses.
    student_rows = get_courses(source_df)
    print(student_rows)

    # This file contains the new rows that need to be added to the student
    # sheet.
    gcat.put_file("tmp_us_canada_students_NEW", student_rows, update=True)
import gcat
import pandas as pd

# Merge the per-region temporary student sheets into the 'student' entry of
# 'WEP Summary Data'.
#
# NOTE(review): 'student_idstudent_username' looks like two column names
# ('student_id' + 'student_username') fused together. It is kept exactly as-is
# because the column selection below relies on it -- confirm against whatever
# produces the 'tmp_frank_students' sheet before renaming anything.

# Load each source; the renames bring the differing column names in line with
# the names selected further down.
frank = gcat.get_file('tmp_frank_students', fmt='pandas', sheet='student')

jami = gcat.get_file('tmp_us_canada_students_NEW', fmt='pandas').rename(
    columns={
        'username': 'student_idstudent_username',
        'course_id': 'student_course_id',
    }
)

annie = gcat.get_file('tmp_ar_students', fmt='pandas').rename(
    columns={'course_id': 'student_course_id'}
)

tom = gcat.get_file('tmp_brazil_student', fmt='pandas').rename(
    columns={
        'student': 'student_idstudent_username',
        'course_id': 'student_course_id',
    }
)

# Dump every frame for a visual check before merging.
print(frank)
print(jami)
print(annie)
print(tom)

# Stack the four sources and keep only the shared student columns, in this
# order.
all_students = pd.concat([frank, jami, annie, tom])
all_students = all_students[[
    'student_idstudent_username',
    'student_course_id',
    'student_lastname',
    'student_firstname',
    'student_email',
]]

# Replace the 'student' entry of the summary data and upload it again.
# Presumably `final` is keyed by sheet name -- TODO confirm.
final = gcat.get_file('WEP Summary Data', fmt='pandas')
final['student'] = all_students
gcat.put_file('WEP Summary Data', final, update=True)
'university_city', 'university_country'] course_fields = ['course_id', 'course_coursename', 'course_instructor', 'course_university_id', 'course_language', 'course_year', 'course_term', 'course_startdate', 'course_enddate'] student_fields = ['student_id' 'student_username', 'student_course_id', 'student_lastname', 'student_firstname', 'student_email'] data = { 'university' : pd.read_table('frank_db/Education Program_2013-01-09_universities.csv', sep=';', names=univ_fields), 'course' : pd.read_table('frank_db/Education Program_2013-01-09_courses.csv', sep=';', names=course_fields), 'student' : pd.read_table('frank_db/Education Program_2013-01-09_students.csv', sep=';', names=student_fields), } print data dest_name = 'tmp_frank_students' gcat.put_file(dest_name, data, update=True)