Пример #1
0
def gc_scatter_correlation_matrix(raw_dataframe, diagonal='kde',
                                  annotate_correlation=False):
    """Plot a scatter/correlation matrix for a GradeCentre-style dataframe.

    Only non-empty ``float64`` columns of *raw_dataframe* are plotted.  Each
    retained column header must contain a ``[Total Pts: N`` (or
    ``[Total Pts: up to N``) tag preceded by a space; ``N`` is used as the
    upper bound of that column's grade range and everything from the
    `` [`` onwards is stripped from the displayed column name.

    Parameters
    ----------
    raw_dataframe : pandas.DataFrame
        Dataframe whose column headers follow the format described above.
    diagonal : str
        Forwarded unchanged to ``scatter_correlation_matrix``.
    annotate_correlation : bool
        Forwarded unchanged to ``scatter_correlation_matrix``.

    Returns
    -------
    Whatever ``scatter_correlation_matrix`` returns (presumably a figure --
    confirm against that helper).

    Raises
    ------
    AttributeError
        If a retained column header does not match the expected
        ``... [Total Pts: N ...`` format (``re.match`` yields ``None``).
    """
    #
    # Filter non-numeric columns
    # Empty columns will also be dropped.
    numeric_colnames = []
    for colname in raw_dataframe.columns:
        col = raw_dataframe[colname].dropna()
        if (len(col) > 0) and (col.dtype == 'float64'):
            numeric_colnames.append(colname)

    dframe = raw_dataframe.reindex(columns=numeric_colnames)

    #
    # Find grade ranges (0 to max mark)
    grade_ranges = []
    for colname in dframe.columns:
        max_grade = re.match(r"^.*\[Total Pts:( up to)? (\d+).*$",
                             colname).group(2)
        grade_ranges.append((0, float(max_grade)))

    #
    # Rename to filter out GradeCentre chaff
    def transform(s):
        return re.match(r"^(.*?)( \[.*$)", s).group(1)

    dframe = dframe.rename(columns=transform)

    #
    # Disambiguate columns that ended up with the same name by appending a
    # 1-based occurrence counter: "name (1)", "name (2)", ...
    # Done positionally rather than via ``columns.get_loc`` because the
    # latter returns an int, a slice or a boolean mask depending on the index.
    totals = collections.Counter(dframe.columns)
    seen = collections.Counter()
    unique_names = []
    for name in dframe.columns:
        if totals[name] >= 2:
            seen[name] += 1
            unique_names.append("%s (%d)" % (name, seen[name]))
        else:
            unique_names.append(name)
    dframe.columns = unique_names

    #
    # Create grade range dictionary: final column name -> (0, max grade).
    # grade_ranges is in column order, so a positional zip keeps them aligned.
    ranges_dict = dict(zip(dframe.columns, grade_ranges))

    fig = scatter_correlation_matrix(dframe, diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     ranges=ranges_dict)
    return fig
Пример #2
0
def scm_for_worksheet(df_raw,
                      diagonal='kde',
                      annotate_correlation=True,
                      colour_correlation=True):
    """Plot a scatter/correlation matrix for one module's worksheet.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Dataframe representing the worksheet for a particular module.
        Column 2 is taken as the module mark (weighted total) and columns
        4 onwards as the individual courseworks; the first row is treated
        as a sub-header (e.g. "Mark", ...) and discarded.
    diagonal, annotate_correlation, colour_correlation
        Forwarded unchanged to ``scatter_correlation_matrix``.

    Returns
    -------
    Whatever ``scatter_correlation_matrix`` returns (presumably a figure --
    confirm against that helper).
    """
    ncols = len(df_raw.columns)

    # Obtain subset of cols
    # 2 = module marks (weighted total)
    # range(4,ncols) = individual courseworks
    # list(...) is required: on Python 3 a range cannot be added to a list.
    indxs = [2] + list(range(4, ncols))
    df = df_raw.iloc[:, indxs]  # iloc: = 0, n-1

    # Drop first row ("Mark", ...) and renumber the remaining rows from 0.
    # drop=True discards the old index instead of inserting it as a column
    # that then has to be removed again.
    df = df.iloc[1:, :].reset_index(drop=True)

    # Change dtypes: every remaining column is expected to hold numbers.
    df = df.astype(float)

    # Set grade ranges for each coursework
    # (Assume all webmark grades are always 0 to 100)
    ranges_dict = {cname: (0.0, 100.0) for cname in df.columns}

    # Plot
    fig = scatter_correlation_matrix(df,
                                     diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     colour_correlation=colour_correlation,
                                     ranges=ranges_dict)
    return fig
Пример #3
0
def scm_for_worksheet(df_raw, diagonal='kde', annotate_correlation=True,
                      colour_correlation=True):
    """Build the scatter/correlation matrix plot for a module worksheet.

    df_raw: dataframe representing worksheet for a particular module.
        Column 2 (module mark, weighted total) and columns 4.. (individual
        courseworks) are plotted; the first row ("Mark", ...) is dropped.
    diagonal, annotate_correlation, colour_correlation: passed straight
        through to ``scatter_correlation_matrix``.

    Returns the result of ``scatter_correlation_matrix`` (presumably a
    figure -- confirm against that helper).
    """
    ncols = len(df_raw.columns)

    # Obtain subset of cols
    #   2 = module marks (weighted total)
    #   range(4,ncols) = individual courseworks
    # The range is materialised so the concatenation also works on Python 3.
    indxs = [2] + list(range(4, ncols))
    df = df_raw.iloc[:, indxs]  # iloc: = 0, n-1

    # Drop first row ("Mark", ...)
    df = df.iloc[1:, :]

    # Renumber rows from 0; drop=True throws the old index away rather than
    # adding it as an extra column.
    df = df.reset_index(drop=True)

    # Change dtypes
    df = df.astype(float)

    # Set grade ranges for each coursework
    # (Assume all webmark grades are always 0 to 100)
    ranges_dict = dict((cname, (0.0, 100.0)) for cname in df.columns)

    # Plot
    fig = scatter_correlation_matrix(df, diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     colour_correlation=colour_correlation,
                                     ranges=ranges_dict)
    return fig