Example #1
def get_universities(courses):
    universities = gcat.get_file("WEP Summary Data", sheet="university", fmt="pandas", usecache=True)
    present = set(universities["university_name"])
    needed = set(courses["University"])
    missing = needed - present
    print "missing universities: %s" % missing

    # assume the missing universities have been added
    courses_merged = pd.merge(courses, universities, left_on="University", right_on="university_name")
    # extract the term as its own field ('fall 2012' -> 'fall')
    courses_merged["Term"] = courses_merged["Term"].map(lambda term_str: term_str.split(" ")[0])
    # print courses_merged
    courses_final = courses_merged[
        [
            "Course Name",
            "Professor Name",
            "university_id",
            "course_language",
            "course_year",
            "course_term",
            "Start Date",
            "End Date",
        ]
    ]
    date_formatter = (
        lambda dstr: dateutil.parser.parse(dstr).strftime("%Y%m%d") if isinstance(dstr, basestring) else dstr
    )
    courses_final["Start Date"] = courses_final["Start Date"].map(date_formatter)
    courses_final["End Date"] = courses_final["End Date"].map(date_formatter)
    # print courses_merged
    return courses_final
Example #2
def check_missing_unis(data):
    universities = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='university', usecache=False)

    needed = set(data['university'].unique())
    present = set(universities['university_name'])
    missing = needed - present
    print 'missing: %s' % missing
Example #3
File: get_articles.py Project: embr/wmf
def get_wep_articles():
    cache_fname = 'wep_articles_ids.cache'
    if os.path.exists(cache_fname):
        return json.load(open(cache_fname))
    else:
        wep_lang = 'en'
        wep = gcat.get_file('WEP Data Project Model', sheet='Article Quality Reviews', fmt='pandas', usecache=True)
        page_titles = list(wep['Article Title'].unique())

        # get page_ids from the enwiki db
        host_name = get_host_name(wep_lang)
        db = MySQLdb.connect(host=host_name,
                             read_default_file=os.path.expanduser('~/.my.cnf'),
                             db='%swiki' % wep_lang,
                             cursorclass=MySQLdb.cursors.DictCursor)
        cur = db.cursor(MySQLdb.cursors.DictCursor)
        set_str = ', '.join(['%s']*len(page_titles))
        query = """SELECT * FROM page WHERE page_title IN (%s)""" % set_str
        logger.debug(query, *page_titles)
        cur.execute(query, tuple(page_titles))
        res = cur.fetchall()
        pages = []
        for row in res:
            pages.append({'language' : wep_lang, 'page_id' : row['page_id'], 'title' : row['page_title']})
        logger.debug('len(pages): %d', len(pages))
        json.dump(pages, open(cache_fname, 'w'), indent=2)
        return pages
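Examples #3 and #16 call a get_host_name helper that is not included in the excerpt. A minimal sketch, assuming the replica hosts follow a '<lang>wiki' naming convention; the host suffix here is a placeholder, not a confirmed hostname, and depends on the environment:

def get_host_name(lang):
    # hypothetical: derive the replica DB host from the language code;
    # '%swiki.labsdb' is a placeholder pattern for illustration only
    return '%swiki.labsdb' % lang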
Example #4
File: get_rows.py Project: embr/nonce
def get_ambassadors():
    title = 'WEP Data Project Model'
    sheet = 'Ambassador'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    rename(rows, 'Type', 'Ambassador Type')
    rename(rows, 'Username', 'Ambassador Username')
    rows = split_rows(rows, 'Course ID', split_fn=ambassador_splitter)
    return rows
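The rename and split_rows helpers live elsewhere in get_rows.py. A plausible sketch, assuming fmt='dict' yields a list of per-row dicts and that ambassador_splitter maps a delimited 'Course ID' cell to a list of IDs:

def rename(rows, old_key, new_key):
    # move each row's value from old_key to new_key, in place
    for row in rows:
        if old_key in row:
            row[new_key] = row.pop(old_key)

def split_rows(rows, key, split_fn):
    # emit one copy of the row per value produced by split_fn(row[key])
    out = []
    for row in rows:
        for val in split_fn(row[key]):
            new_row = dict(row)
            new_row[key] = val
            out.append(new_row)
    return out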
Example #5
File: make_csv.py Project: embr/nonce
def main():
    data = gcat.get_file('WEP Summary Data', fmt='pandas', usecache=False)
    joined = pd.merge(data['course'], data['student'], left_on='course_id', right_on='student_course_id')
    necessary = joined[['student_idstudent_username','course_language','course_startdate','course_coursename']]
    logger.debug('necessary.dtypes: %s', necessary.dtypes)
    necessary['course_startdate'] = necessary['course_startdate'].map(date_cleaner)
    logger.debug(necessary[:10])
    necessary.to_csv('wep_data.csv',encoding='utf-8', index=False)
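The date_cleaner helper is not part of this excerpt. A plausible sketch, modeled on the date_formatter lambda in Example #1; the name and exact output format are assumptions:

import dateutil.parser

def date_cleaner(dstr):
    # normalize spreadsheet date strings to YYYYMMDD;
    # pass through cells that are not strings (e.g. NaN)
    if isinstance(dstr, basestring):
        return dateutil.parser.parse(dstr).strftime('%Y%m%d')
    return dstr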
Example #6
def get_country_data():
    country_df = gcat.get_file('Global South and Region Classifications', 
            sheet='data',
            fmt='pandas',
            usecache=True)
    country_df = country_df[country_df['ISO-3166 Alpha-2'].notnull()]
    country_df['MaxMind Country'] = country_df['MaxMind Country'].apply(clean_location)
    return country_df
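clean_location (also used on the grants data in get_grant_data, Example #13) is not shown. Since its output is used as a lookup key, here is a conservative sketch that only normalizes whitespace; anything beyond that is an assumption:

def clean_location(loc):
    # hypothetical: trim stray whitespace so country names match across sheets;
    # the real helper may also repair casing or known misspellings
    if isinstance(loc, basestring):
        return loc.strip()
    return loc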
Example #7
File: get_rows.py Project: embr/nonce
def get_course_meta_data():
    title = 'WEP Data Project Model'
    sheet = 'Course Meta Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    rows = update_val(rows, 'Start Date', dateutil.parser.parse)
    rows = update_val(rows, 'Start Date', datetime.datetime.date)
    rows = update_val(rows, 'End Date', dateutil.parser.parse)
    rows = update_val(rows, 'End Date', datetime.datetime.date)
    return rows
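update_val is chained twice per column above, so it presumably mutates the rows and returns them. A minimal sketch under that assumption:

def update_val(rows, key, fn):
    # apply fn to the value stored under key in every row
    for row in rows:
        row[key] = fn(row[key])
    return rows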
Example #8
File: get_rows.py Project: embr/nonce
def get_reviews():
    logging.debug('entering')
    title = 'WEP Data Project Model'
    sheet = 'Article Quality Reviews'
    reviews = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    reviews_by_title = group_by(reviews, 'Article Title')
    review_rows = []
    for title, group in reviews_by_title.items():
        pre = None
        post = None
        new = None
        for row in group:
            if row['Type'] == 'Pre':
                pre = row
            elif row['Type'] == 'Post':
                post = row
            elif row['Type'] == 'New Article':
                new = row
        if new:
            if pre or post:
                logging.warning('found an article with both new and pre/post ratings')
                continue
            new['new_score'] = float(new['Total'])
            review_rows.append(new)
        if not (pre and post):
            # need both a pre and a post review to compute a score difference
            continue
        diff = float(post['Total']) - float(pre['Total'])
        diff_row = pre
        del diff_row['Type']
        diff_row['score_diff'] = diff
        del diff_row['Total']
        review_rows.append(diff_row)

    reviews_by_course = group_by(reviews, 'Course ID')
    course_rows = []
    for course_id, group in reviews_by_course.items():
        diff_rows = []
        new_rows = []
        for row in group:
            if 'score_diff' in row:
                diff_rows.append(row)
            elif 'new_score' in row:
                new_rows.append(row)
        course_row = group[0]
        if diff_rows:
            course_row['Mean(Score Difference)'] = sum(map(op.itemgetter('score_diff'), diff_rows)) / float(len(diff_rows))
        if new_rows:
            course_row['Mean(New Article Scores)'] = sum(map(op.itemgetter('new_score'), new_rows)) / float(len(new_rows))
        if 'score_diff' in course_row:
            del course_row['score_diff']
        elif 'new_score' in course_row:
            del course_row['new_score']
        course_rows.append(course_row)
    logging.debug('found %d course_rows', len(course_rows))
    return course_rows
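group_by is the one non-obvious helper this example leans on; a minimal sketch, assuming it buckets the fmt='dict' rows by a column value:

def group_by(rows, key):
    # bucket rows into a dict mapping each key value to its list of rows
    groups = {}
    for row in rows:
        groups.setdefault(row[key], []).append(row)
    return groups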
Example #9
def get_course_meta_data(course_names):
    wep_data_courses_df = gcat.get_file(
        "WEP Data Project Model", sheet="Course Meta Data", fmt="pandas", usecache=False
    )
    wep_data = wep_data_courses_df[
        wep_data_courses_df["Course Name"].isin(course_names) & (wep_data_courses_df["Term"] == "Fall 2012")
    ]
    wep_data["course_language"] = "en"
    wep_data["course_year"] = "2012"
    wep_data["course_term"] = "fall"
    return wep_data
Example #10
def add_global_south(rows):
    meta = gcat.get_file('Global South and Region Classifications', fmt='pandas', sheet='data')
    logger.debug('meta:\n%s', meta)
    labels = dict(meta[['MaxMind Country', 'Global South']].values)

    req_gs = rows['Country of requestor (short form)'].apply(lambda c : labels.get(c, 'Unknown Country Name'))
    rows['Global South (requestor)'] = req_gs
    logger.debug('req_gs:\n%s', req_gs)
    impact_gs = rows['Country of impact (short form)'].apply(lambda c : labels.get(c, 'Unknown Country Name'))
    rows['Global South (use)'] = impact_gs
    logger.debug('impact_gs:\n%s', impact_gs)
    return rows
Example #11
def get_wep_users():
    """ grab usernames from google drive doc """
    wep_wb = gcat.get_file('WEP Summary Data', usecache=True, fmt='pandas')
    full = pd.merge(wep_wb['student'], wep_wb['course'], left_on='student_course_id', right_on='course_id')
    full = pd.merge(full, wep_wb['university'], left_on='course_university_id', right_on='university_id')
    #en_na =full[(full['course_term'] == 'fall') & (full['course_year'] == 2012) & (full['university_country'].isin(['US','Canada']))]
    en_na = full[full['university_country'].isin(['United States','Canada'])]
    users = []
    for idx, row in en_na.iterrows():
        username = row['student_idstudent_username']
        users.append(mwstats.DBUser(username=username, project=row['course_language'] + 'wiki'))
    logger.info('starting with %d WEP students', len(users))
    return users
Example #12
def get_courses(students):
    courses = gcat.get_file("WEP Summary Data", sheet="course", fmt="pandas", usecache=True)
    courses_filtered = courses[(courses["course_year"] == 2012) & (courses["course_term"] == "fall")]

    present = set(courses_filtered["course_coursename"])
    needed = set(students["course"])
    missing = needed - present
    print "missing courses: %s" % missing

    # assume the missing courses have been added
    students_merged = pd.merge(students, courses_filtered, left_on="course", right_on="course_coursename")
    students_final = students_merged[["username", "course_id"]]
    return students_final
Example #13
def get_grant_data():
    file_title = '(L&E) Grants data FY2012-13_2'
    ex = gcat.get_file(file_title, fmt='pandas_excel', usecache=False)
    df = ex.parse('Grants Data', skip_footer=23).ix[:,:8]
    df['Timing'].replace({
        'Q1-Q2 2012-13' : datetime.date(2012,12,1),
        'Q3-Q4 2012-13' : datetime.date(2013,6,1),
        'Q1-Q2 2013-14' : datetime.date(2013,12,1),
        'Q3-Q4 2013-14' : datetime.date(2014,6,1),
        }, inplace=True)
    df['Global South/North'].replace({'Undefined':'N/A'}, inplace=True)
    df['Timing'] = pd.to_datetime(df['Timing'])
    df['Location'] = df['Location'].apply(clean_location)
    return df
Example #14
def main():
    opts = parse_args()

    f = gcat.get_file(opts['grants_file'], fmt='pandas_excel', usecache=False)

    all_rows = pd.DataFrame()
    graphs = []
    for sn in ['FY%d%d' % (y-1, y) for y in opts['years']]:
        logger.debug('processing fiscal year: %s', sn)
        rows = f.parse(sn, skiprows=2)
        logger.debug(rows)
        # all_rows = pd.concat([all_rows, clean_rows(rows)])
        all_rows = pd.concat([all_rows, rows])
        all_rows = add_global_south(all_rows)
        
        graphs.extend(write_groups(all_rows))
        graphs.extend(write_total(all_rows))
    
    db = limnpy.Dashboard('grants', 'Wikimedia Grants', 'Dashboard')
    db.add_tab('all', map(lambda g : g.__graph__['id'], graphs))
    db.write(basedir)
Example #15
def main():

    """ get the original data and clean it """
    # orig_df = load_rtfs()
    # gcat.put_file('wep_us_canada',df)
    orig_df = gcat.get_file("wep_us_canada", fmt="pandas", usecache=False)

    """ get course meta data """
    orig_courses = set(orig_df["course"])
    courses = get_course_meta_data(orig_courses)

    """ find universities """
    courses_with_uni = get_universities(courses)
    # this file contains the new rows to be added to the course sheet
    # gcat.put_file('tmp_us_canada_courses', courses_final, update=True)

    """ merge original data with the courses """
    students_with_courses = get_courses(orig_df)
    print students_with_courses
    # this file contains new rows that need to be added to the student sheet
    gcat.put_file("tmp_us_canada_students_NEW", students_with_courses, update=True)
Example #16
def get_wep_articles():
    cache_fname = "wep_articles_ids.cache"
    if os.path.exists(cache_fname):
        return json.load(open(cache_fname))
    else:
        wep_lang = "en"
        wep = gcat.get_file("WEP Data Project Model", sheet="Article Quality Reviews", fmt="pandas", usecache=True)
        wep = wep[wep["Type"] == "New Article"]
        page_titles = list(wep["Article Title"].unique())
        wiki_page_titles = [title.encode("utf-8").replace(" ", "_") for title in page_titles]

        # get page_ids from the enwiki db
        host_name = get_host_name(wep_lang)
        db = MySQLdb.connect(
            host=host_name,
            read_default_file=os.path.expanduser("~/.my.cnf"),
            db="%swiki" % wep_lang,
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        cur = db.cursor(MySQLdb.cursors.DictCursor)
        set_str = ", ".join(["%s"] * len(wiki_page_titles))
        query = """SELECT * FROM page WHERE page_title IN (%s)""" % set_str
        logger.debug(query, *wiki_page_titles)
        cur.execute(query, tuple(wiki_page_titles))
        res = cur.fetchall()
        logger.debug("len(res): %d", len(res))
        pages = []
        for row in res:
            pages.append({"language": wep_lang, "page_id": row["page_id"], "title": row["page_title"].encode("utf-8")})
        logger.debug("len(missing): %d", len(set(wiki_page_titles) - set(map(itemgetter("title"), pages))))
        logger.debug("missing: %s", set(wiki_page_titles) - set(map(itemgetter("title"), pages)))
        logger.debug("len(res): %d" % len(pages))
        json.dump(pages, open(cache_fname, "w"), indent=2)
        logger.debug("wep: %s", pprint.pformat(pages))
        return pages
Example #17
File: ns_edits.py Project: embr/nonce
def get_wep_students(lang, cur):
    cache_name = '%s.wep_users.cache.csv' % lang
    if not os.path.exists(cache_name):
        students = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='student')
        usernames = list(students['student_idstudent_username'])
        print 'len(usernames) = %d' % len(usernames)
        query_fmt = u"""SELECT user_name, user_id FROM user WHERE user_name IN (%s)"""
        list_fmt = ', '.join(['%s']*len(usernames))
        #print 'list fmt:'
        #print list_fmt
        query_list_fmt = query_fmt % list_fmt
        #print 'query_list_fmt: %s'
        #print query_list_fmt
        #query_str = query_list_fmt % usernames
        #print '### SUPER OFFICIAL QUERY STRING: %s' % query_str
        cur.execute(query_list_fmt, tuple([u.encode('utf-8') for u in usernames]))
        res = cur.fetchall()
        df = pd.DataFrame(list(res), columns=['user_name', 'user_id'])
        print 'len(df) = %d' % len(df)
        print df
        df.to_csv(cache_name, index=False)
        return df
    df = pd.read_csv(cache_name)
    return df
Example #18
File: get_users.py Project: embr/nonce
# coding: utf-8
import gcat
import pandas as pd


def expand(df, expand_col, delim=","):
    expand_col_ind = expand_col if isinstance(expand_col, int) else df.columns.get_loc(expand_col)
    for row in df.itertuples(index=False):
        langs = map(unicode.strip, row[expand_col_ind].split(delim))
        num_lang = len(langs)
        if num_lang > 1:
            for i in range(num_lang):
                row_copy = list(row)
                row_copy[expand_col_ind] = langs[i]
                yield row_copy
        else:
            yield list(row)


df = gcat.get_file("wikipedia database with username", fmt="pandas", usecache=True)
print len(df)
df["Time Stamp"] = pd.to_datetime(df["Time Stamp"])
df = df[df["Time Stamp"].notnull()]
print len(df)

df = pd.DataFrame(list(expand(df, 3)), columns=df.columns)

print len(df)
df.to_csv("users.csv", encoding="utf-8", index=False)
Example #19
File: upload_all.py Project: embr/nonce
import gcat
import pandas as pd

frank = gcat.get_file('tmp_frank_students',fmt='pandas', sheet='student')
jami = gcat.get_file('tmp_us_canada_students_NEW',fmt='pandas')
annie = gcat.get_file('tmp_ar_students', fmt='pandas')
tom = gcat.get_file('tmp_brazil_student', fmt='pandas')

jami = jami.rename(columns={'username' : 'student_idstudent_username', 'course_id' : 'student_course_id'})
annie = annie.rename(columns={'course_id' : 'student_course_id'})
tom = tom.rename(columns={'student' : 'student_idstudent_username', 'course_id' : 'student_course_id'})


print frank
print jami
print annie
print tom

all_students = pd.concat([frank, jami, annie, tom])
all_students = all_students[['student_idstudent_username','student_course_id','student_lastname','student_firstname','student_email']]

final = gcat.get_file('WEP Summary Data', fmt='pandas')
final['student'] = all_students
gcat.put_file('WEP Summary Data', final, update=True)
Example #20
File: get_rows.py Project: embr/nonce
def get_survey_data():
    logging.debug('entering')
    title = 'WEP Data Project Model'
    sheet = 'Survey Course Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    return rows
Example #21
def get_courses(orig_df):
    universities = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='university', usecache=False)

    course_final = pd.merge(orig_df, universities, left_on='university', right_on='university_name')
    course_final = course_final[['university_id', 'course']].drop_duplicates()
    return course_final
Example #22
File: get_rows.py Project: embr/nonce
def get_public_course_data():
    title = 'WEP Data Project Model'
    sheet = 'Public Course Data'
    rows = gcat.get_file(title=title, sheet=sheet, usecache=True, fmt='dict')
    binarize(rows, 'Course Subject Area')
    return rows
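binarize is defined elsewhere in get_rows.py. A sketch under the assumption that it expands a categorical column into per-value 0/1 indicator fields, in place; the generated field names are illustrative:

def binarize(rows, key):
    # hypothetical: add one 0/1 indicator field per observed value of key
    values = set(row[key] for row in rows)
    for row in rows:
        for val in values:
            row['%s: %s' % (key, val)] = 1 if row[key] == val else 0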
Example #23
def get_students(orig_df):
    courses = gcat.get_file('WEP Summary Data', fmt='pandas', sheet='course', usecache=False)
    student_final = pd.merge(orig_df, courses, left_on='course', right_on='course_coursename')
    student_final = student_final[['student', 'course_id']]
    return student_final