def get_universities(courses):
    universities = gcat.get_file("WEP Summary Data", sheet="university", fmt="pandas", usecache=True)
    present = set(universities["university_name"])
    needed = set(courses["University"])
    missing = needed - present
    print "missing universities: %s" % missing
    # assume the missing universities have been added
    courses_merged = pd.merge(courses, universities, left_on="University", right_on="university_name")
    # add the term as its own field ('fall 2012' -> 'fall')
    courses_merged["Term"] = courses_merged["Term"].map(lambda term_str: term_str.split(" ")[0])
    # print courses_merged
    courses_final = courses_merged[
        [
            "Course Name",
            "Professor Name",
            "university_id",
            "course_language",
            "course_year",
            "course_term",
            "Start Date",
            "End Date",
        ]
    ]
    date_formatter = (
        lambda dstr: dateutil.parser.parse(dstr).strftime("%Y%m%d") if isinstance(dstr, basestring) else dstr
    )
    courses_final["Start Date"] = courses_final["Start Date"].map(date_formatter)
    courses_final["End Date"] = courses_final["End Date"].map(date_formatter)
    # print courses_merged
    return courses_final

def check_missing_unis(data):
    universities = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='university', usecache=False)
    needed = set(data['university'].unique())
    present = set(universities['university_name'])
    missing = needed - present
    print 'missing: %s' % missing

def get_wep_articles():
    cache_fname = 'wep_articles_ids.cache'
    if os.path.exists(cache_fname):
        return json.load(open(cache_fname))
    else:
        wep_lang = 'en'
        wep = gcat.get_file('WEP Data Project Model', sheet='Article Quality Reviews', fmt='pandas', usecache=True)
        page_titles = list(wep['Article Title'].unique())
        # get page_ids from the enwiki db
        host_name = get_host_name(wep_lang)
        db = MySQLdb.connect(host=host_name,
                             read_default_file=os.path.expanduser('~/.my.cnf'),
                             db='%swiki' % wep_lang,
                             cursorclass=MySQLdb.cursors.DictCursor)
        cur = db.cursor(MySQLdb.cursors.DictCursor)
        set_str = ', '.join(['%s'] * len(page_titles))
        query = """SELECT * FROM page WHERE page_title IN (%s)""" % set_str
        logger.debug(query, *page_titles)
        cur.execute(query, tuple(page_titles))
        res = cur.fetchall()
        pages = []
        for row in res:
            pages.append({'language': wep_lang, 'page_id': row['page_id'], 'title': row['page_title']})
        logger.debug('len(pages): %d' % len(pages))
        json.dump(pages, open(cache_fname, 'w'), indent=2)
        return pages

def get_ambassadors():
    title = 'WEP Data Project Model'
    sheet = 'Ambassador'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    rename(rows, 'Type', 'Ambassador Type')
    rename(rows, 'Username', 'Ambassador Username')
    rows = split_rows(rows, 'Course ID', split_fn=ambassador_splitter)
    return rows

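# NOTE: `rename`, `split_rows`, and `ambassador_splitter` are not defined in this
# section. The sketch below is a hypothetical illustration of what the first two
# might look like, inferred only from how they are called above; it is not the
# project's actual code.
def rename(rows, old_key, new_key):
    # move a value from one column name to another, in place, for every row dict
    for row in rows:
        if old_key in row:
            row[new_key] = row.pop(old_key)


def split_rows(rows, key, split_fn):
    # emit one copy of each row per value produced by split_fn(row[key]),
    # e.g. an ambassador assigned to "101, 102" becomes two rows
    out = []
    for row in rows:
        for val in split_fn(row.get(key, '')):
            row_copy = dict(row)
            row_copy[key] = val
            out.append(row_copy)
    return out
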
def main():
    data = gcat.get_file('WEP Summary Data', fmt='pandas', usecache=False)
    joined = pd.merge(data['course'], data['student'], left_on='course_id', right_on='student_course_id')
    necessary = joined[['student_idstudent_username', 'course_language', 'course_startdate', 'course_coursename']]
    logger.debug('necessary.dtypes: %s', necessary.dtypes)
    necessary['course_startdate'] = necessary['course_startdate'].map(date_cleaner)
    logger.debug(necessary[:10])
    necessary.to_csv('wep_data.csv', encoding='utf-8', index=False)

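# NOTE: `date_cleaner` is not defined in this section. A hypothetical version,
# assuming it normalizes whatever string is in `course_startdate` to an ISO date
# and passes non-strings (e.g. NaN) through untouched:
import dateutil.parser


def date_cleaner(dstr):
    if not isinstance(dstr, basestring):
        return dstr
    try:
        return dateutil.parser.parse(dstr).strftime('%Y-%m-%d')
    except (ValueError, TypeError):
        return None
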
def get_country_data():
    country_df = gcat.get_file('Global South and Region Classifications', sheet='data', fmt='pandas', usecache=True)
    country_df = country_df[country_df['ISO-3166 Alpha-2'].notnull()]
    country_df['MaxMind Country'] = country_df['MaxMind Country'].apply(clean_location)
    return country_df

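# NOTE: `clean_location` is not defined in this section. The sketch below is a
# guess at its intent (normalize country-name strings so they join cleanly
# against the MaxMind names), not the actual helper used by these scripts.
def clean_location(loc):
    if not isinstance(loc, basestring):
        return loc
    # collapse internal whitespace and strip leading/trailing spaces
    return ' '.join(loc.strip().split())
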
def get_course_meta_data():
    title = 'WEP Data Project Model'
    sheet = 'Course Meta Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    rows = update_val(rows, 'Start Date', dateutil.parser.parse)
    rows = update_val(rows, 'Start Date', datetime.datetime.date)
    rows = update_val(rows, 'End Date', dateutil.parser.parse)
    rows = update_val(rows, 'End Date', datetime.datetime.date)
    return rows

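# NOTE: `update_val` is not defined in this section. Judging from the calls above
# it applies a function to one field of every row dict; a hypothetical sketch:
def update_val(rows, key, fn):
    for row in rows:
        if row.get(key):
            row[key] = fn(row[key])
    return rows
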
def get_reviews():
    logging.debug('entering')
    title = 'WEP Data Project Model'
    sheet = 'Article Quality Reviews'
    reviews = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    reviews_by_title = group_by(reviews, 'Article Title')
    review_rows = []
    for title, group in reviews_by_title.items():
        pre = None
        post = None
        new = None
        for row in group:
            if row['Type'] == 'Pre':
                pre = row
            elif row['Type'] == 'Post':
                post = row
            elif row['Type'] == 'New Article':
                new = row
        if new:
            if pre or post:
                logging.warning('found an article with both a new-article rating and a pre/post rating')
                continue
            new['new_score'] = float(new['Total'])
            review_rows.append(new)
        if pre is None or post is None:
            # need both a pre and a post rating to compute a difference
            continue
        diff = float(post['Total']) - float(pre['Total'])
        diff_row = pre
        del diff_row['Type']
        diff_row['score_diff'] = diff
        del diff_row['Total']
        review_rows.append(diff_row)
    reviews_by_course = group_by(reviews, 'Course ID')
    course_rows = []
    for course_id, group in reviews_by_course.items():
        diff_rows = []
        new_rows = []
        for row in group:
            if 'score_diff' in row:
                diff_rows.append(row)
            elif 'new_score' in row:
                new_rows.append(row)
        course_row = group[0]
        if diff_rows:
            course_row['Mean(Score Difference)'] = sum(map(op.itemgetter('score_diff'), diff_rows)) / float(len(diff_rows))
        if new_rows:
            course_row['Mean(New Article Scores)'] = sum(map(op.itemgetter('new_score'), new_rows)) / float(len(new_rows))
        if 'score_diff' in course_row:
            del course_row['score_diff']
        elif 'new_score' in course_row:
            del course_row['new_score']
        course_rows.append(course_row)
    logging.debug('found %d course_rows', len(course_rows))
    return course_rows

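# NOTE: `group_by` is not defined in this section. Presumably it buckets the row
# dicts by the value of one column; a hypothetical sketch:
import collections


def group_by(rows, key):
    groups = collections.defaultdict(list)
    for row in rows:
        groups[row[key]].append(row)
    return groups
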
def get_course_meta_data(course_names):
    wep_data_courses_df = gcat.get_file(
        "WEP Data Project Model", sheet="Course Meta Data", fmt="pandas", usecache=False
    )
    wep_data = wep_data_courses_df[
        wep_data_courses_df["Course Name"].isin(course_names) & (wep_data_courses_df["Term"] == "Fall 2012")
    ]
    wep_data["course_language"] = "en"
    wep_data["course_year"] = "2012"
    wep_data["course_term"] = "fall"
    return wep_data

def add_global_south(rows):
    meta = gcat.get_file('Global South and Region Classifications', fmt='pandas', sheet='data')
    logger.debug('meta:\n%s', meta)
    labels = dict(meta[['MaxMind Country', 'Global South']].values)
    req_gs = rows['Country of requestor (short form)'].apply(lambda c: labels.get(c, 'Unknown Country Name'))
    rows['Global South (requestor)'] = req_gs
    logger.debug('req_gs:\n%s', req_gs)
    impact_gs = rows['Country of impact (short form)'].apply(lambda c: labels.get(c, 'Unknown Country Name'))
    rows['Global South (use)'] = impact_gs
    logger.debug('impact_gs:\n%s', impact_gs)
    return rows

def get_wep_users():
    """grab usernames from the google drive doc"""
    wep_wb = gcat.get_file('WEP Summary Data', usecache=True, fmt='pandas')
    full = pd.merge(wep_wb['student'], wep_wb['course'], left_on='student_course_id', right_on='course_id')
    full = pd.merge(full, wep_wb['university'], left_on='course_university_id', right_on='university_id')
    # en_na = full[(full['course_term'] == 'fall') & (full['course_year'] == 2012) & (full['university_country'].isin(['US', 'Canada']))]
    en_na = full[full['university_country'].isin(['United States', 'Canada'])]
    users = []
    for idx, row in en_na.iterrows():
        username = row['student_idstudent_username']
        users.append(mwstats.DBUser(username=username, project=row['course_language'] + 'wiki'))
    logger.info('starting with %d WEP students', len(users))
    return users

def get_courses(students):
    courses = gcat.get_file("WEP Summary Data", sheet="course", fmt="pandas", usecache=True)
    courses_filtered = courses[(courses["course_year"] == 2012) & (courses["course_term"] == "fall")]
    present = set(courses_filtered["course_coursename"])
    needed = set(students["course"])
    missing = needed - present
    print "missing courses: %s" % missing
    # assume the missing courses have been added
    students_merged = pd.merge(students, courses_filtered, left_on="course", right_on="course_coursename")
    students_final = students_merged[["username", "course_id"]]
    return students_final

def get_grant_data():
    file_title = '(L&E) Grants data FY2012-13_2'
    ex = gcat.get_file(file_title, fmt='pandas_excel', usecache=False)
    df = ex.parse('Grants Data', skip_footer=23).ix[:, :8]
    df['Timing'].replace({
        'Q1-Q2 2012-13': datetime.date(2012, 12, 1),
        'Q3-Q4 2012-13': datetime.date(2013, 6, 1),
        'Q1-Q2 2013-14': datetime.date(2013, 12, 1),
        'Q3-Q4 2013-14': datetime.date(2014, 6, 1),
    }, inplace=True)
    df['Global South/North'].replace({'Undefined': 'N/A'}, inplace=True)
    df['Timing'] = pd.to_datetime(df['Timing'])
    df['Location'] = df['Location'].apply(clean_location)
    return df

def main():
    opts = parse_args()
    f = gcat.get_file(opts['grants_file'], fmt='pandas_excel', usecache=False)
    all_rows = pd.DataFrame()
    graphs = []
    for sn in ['FY%d%d' % (y - 1, y) for y in opts['years']]:
        logger.debug('processing fiscal year: %s', sn)
        rows = f.parse(sn, skiprows=2)
        logger.debug(rows)
        # all_rows = pd.concat([all_rows, clean_rows(rows)])
        all_rows = pd.concat([all_rows, rows])
    all_rows = add_global_south(all_rows)
    graphs.extend(write_groups(all_rows))
    graphs.extend(write_total(all_rows))
    db = limnpy.Dashboard('grants', 'Wikimedia Grants', 'Dashboard')
    db.add_tab('all', map(lambda g: g.__graph__['id'], graphs))
    db.write(basedir)

def main():
    """ get the original data and clean it """
    # orig_df = load_rtfs()
    # gcat.put_file('wep_us_canada', df)
    orig_df = gcat.get_file("wep_us_canada", fmt="pandas", usecache=False)

    """ get course meta data """
    orig_courses = set(orig_df["course"])
    courses = get_course_meta_data(orig_courses)

    """ find universities """
    courses_with_uni = get_universities(courses)
    # this file contains the new rows to be added to the course sheet
    # gcat.put_file('tmp_us_canada_courses', courses_final, update=True)

    """ merge original data with the courses """
    students_with_courses = get_courses(orig_df)
    print students_with_courses
    # this file contains new rows that need to be added to the student sheet
    gcat.put_file("tmp_us_canada_students_NEW", students_with_courses, update=True)

def get_wep_articles():
    cache_fname = "wep_articles_ids.cache"
    if os.path.exists(cache_fname):
        return json.load(open(cache_fname))
    else:
        wep_lang = "en"
        wep = gcat.get_file("WEP Data Project Model", sheet="Article Quality Reviews", fmt="pandas", usecache=True)
        wep = wep[wep["Type"] == "New Article"]
        page_titles = list(wep["Article Title"].unique())
        wiki_page_titles = [title.encode("utf-8").replace(" ", "_") for title in page_titles]
        # get page_ids from the enwiki db
        host_name = get_host_name(wep_lang)
        db = MySQLdb.connect(
            host=host_name,
            read_default_file=os.path.expanduser("~/.my.cnf"),
            db="%swiki" % wep_lang,
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        cur = db.cursor(MySQLdb.cursors.DictCursor)
        set_str = ", ".join(["%s"] * len(wiki_page_titles))
        query = """SELECT * FROM page WHERE page_title IN (%s)""" % set_str
        logger.debug(query, *wiki_page_titles)
        cur.execute(query, tuple(wiki_page_titles))
        res = cur.fetchall()
        logger.debug("len(res): %d", len(res))
        pages = []
        for row in res:
            pages.append({"language": wep_lang, "page_id": row["page_id"], "title": row["page_title"].encode("utf-8")})
        logger.debug("len(missing): %d", len(set(wiki_page_titles) - set(map(itemgetter("title"), pages))))
        logger.debug("missing: %s", set(wiki_page_titles) - set(map(itemgetter("title"), pages)))
        logger.debug("len(pages): %d" % len(pages))
        json.dump(pages, open(cache_fname, "w"), indent=2)
        logger.debug("wep: %s", pprint.pformat(pages))
        return pages

def get_wep_students(lang, cur):
    cache_name = '%s.wep_users.cache.csv' % lang
    if not os.path.exists(cache_name):
        students = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='student')
        usernames = list(students['student_idstudent_username'])
        print 'len(usernames) = %d' % len(usernames)
        query_fmt = u"""SELECT user_name, user_id FROM user WHERE user_name IN (%s)"""
        list_fmt = ', '.join(['%s'] * len(usernames))
        # print 'list fmt:'
        # print list_fmt
        query_list_fmt = query_fmt % list_fmt
        # print 'query_list_fmt: %s'
        # print query_list_fmt
        # query_str = query_list_fmt % usernames
        # print '### SUPER OFFICIAL QUERY STRING: %s' % query_str
        cur.execute(query_list_fmt, tuple([u.encode('utf-8') for u in usernames]))
        res = cur.fetchall()
        df = pd.DataFrame(list(res), columns=['user_name', 'user_id'])
        print 'len(df) = %d' % len(df)
        print df
        df.to_csv(cache_name, index=False)
        return df
    df = pd.read_csv(cache_name)
    return df

# coding: utf-8
import gcat
import pandas as pd


def expand(df, expand_col, delim=","):
    # yield one copy of each row per delimited value in the expand column
    expand_col_ind = expand_col if isinstance(expand_col, int) else df.columns.get_loc(expand_col)
    for row in df.itertuples(index=False):
        langs = map(unicode.strip, row[expand_col_ind].split(delim))
        num_lang = len(langs)
        if num_lang > 1:
            for i in range(num_lang):
                row_copy = list(row)
                row_copy[expand_col_ind] = langs[i]
                yield row_copy
        else:
            yield list(row)


df = gcat.get_file("wikipedia database with username", fmt="pandas", usecache=True)
print len(df)
df["Time Stamp"] = pd.to_datetime(df["Time Stamp"])
df = df[df["Time Stamp"].notnull()]
print len(df)
df = pd.DataFrame(list(expand(df, 3)), columns=df.columns)
print len(df)
df.to_csv("users.csv", encoding="utf-8", index=False)

import gcat
import pandas as pd

frank = gcat.get_file('tmp_frank_students', fmt='pandas', sheet='student')
jami = gcat.get_file('tmp_us_canada_students_NEW', fmt='pandas')
annie = gcat.get_file('tmp_ar_students', fmt='pandas')
tom = gcat.get_file('tmp_brazil_student', fmt='pandas')

jami = jami.rename(columns={'username': 'student_idstudent_username', 'course_id': 'student_course_id'})
annie = annie.rename(columns={'course_id': 'student_course_id'})
tom = tom.rename(columns={'student': 'student_idstudent_username', 'course_id': 'student_course_id'})

print frank
print jami
print annie
print tom

all_students = pd.concat([frank, jami, annie, tom])
all_students = all_students[['student_idstudent_username', 'student_course_id', 'student_lastname', 'student_firstname', 'student_email']]

final = gcat.get_file('WEP Summary Data', fmt='pandas')
final['student'] = all_students
gcat.put_file('WEP Summary Data', final, update=True)

def get_survey_data():
    logging.debug('entering')
    title = 'WEP Data Project Model'
    sheet = 'Survey Course Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    return rows

def get_courses(orig_df):
    universities = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='university', usecache=False)
    course_final = pd.merge(orig_df, universities, left_on='university', right_on='university_name')
    course_final = course_final[['university_id', 'course']].drop_duplicates()
    return course_final

def get_public_course_data():
    title = 'WEP Data Project Model'
    sheet = 'Public Course Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    binarize(rows, 'Course Subject Area')
    return rows

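# NOTE: `binarize` is not defined in this section. A hypothetical reading of the
# call above is that it expands a categorical column into one indicator column
# per observed value, in place; a sketch under that assumption:
def binarize(rows, key):
    values = set(row.get(key) for row in rows if row.get(key))
    for row in rows:
        for val in values:
            row['%s: %s' % (key, val)] = 1 if row.get(key) == val else 0
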
def get_students(orig_df):
    courses = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='course', usecache=False)
    student_final = pd.merge(orig_df, courses, left_on='course', right_on='course_coursename')
    student_final = student_final[['student', 'course_id']]
    return student_final