def gc_scatter_correlation_matrix(raw_dataframe, diagonal='kde', annotate_correlation=False):
    """Plot a scatter/correlation matrix for a GradeCentre-style export.

    The function keeps only the non-empty ``float64`` columns of
    ``raw_dataframe``, reads each kept column's maximum mark from the
    "[Total Pts: N" part of its header, strips the bracketed suffix from
    the column names, makes repeated names unique by appending " (1)",
    " (2)", ... in column order, and hands the cleaned frame to
    ``scatter_correlation_matrix`` together with a ``{column: (0, max)}``
    range mapping.

    Parameters
    ----------
    raw_dataframe : pandas-style DataFrame
        Frame whose numeric column headers contain "[Total Pts: N" or
        "[Total Pts: up to N".
    diagonal
        Forwarded unchanged to ``scatter_correlation_matrix``.
    annotate_correlation
        Forwarded unchanged to ``scatter_correlation_matrix``.

    Returns
    -------
    The value returned by ``scatter_correlation_matrix``.

    Raises
    ------
    ValueError
        If a kept column header carries no parseable "[Total Pts: N" marker.
    """
    # Filter non-numeric columns; empty columns are dropped as well.
    numeric_colnames = []
    for colname in raw_dataframe.columns:
        col = raw_dataframe[colname].dropna()
        if len(col) > 0 and col.dtype == 'float64':
            numeric_colnames.append(colname)
    dframe = raw_dataframe.reindex(columns=numeric_colnames)

    # Find grade ranges (0 to max mark) from the original headers.
    total_pts_re = re.compile(r"^.*\[Total Pts:( up to)? (\d+).*$")
    grade_ranges = []
    for colname in dframe.columns:
        found = total_pts_re.match(colname)
        if found is None:
            raise ValueError(
                "Cannot find '[Total Pts: N' in column name %r" % (colname,))
        grade_ranges.append((0, float(found.group(2))))

    # Rename to filter out the bracketed GradeCentre suffix.
    suffix_re = re.compile(r"^(.*?)( \[.*$)")

    def strip_suffix(name):
        found = suffix_re.match(name)
        # A header without a " [" suffix is kept as it is.
        return found.group(1) if found else name

    dframe = dframe.rename(columns=strip_suffix)

    # Number columns that share a name after stripping. Working position by
    # position avoids Index.get_loc, whose result for duplicated labels may
    # be a slice or a boolean mask depending on the index layout.
    totals = collections.Counter(dframe.columns)
    seen = collections.Counter()
    unique_names = []
    for name in dframe.columns:
        if totals[name] >= 2:
            seen[name] += 1
            unique_names.append("%s (%d)" % (name, seen[name]))
        else:
            unique_names.append(name)
    dframe.columns = unique_names

    # Map every final column name to its grade range; positions still line
    # up with grade_ranges because columns were only renamed, not reordered.
    ranges_dict = dict(zip(dframe.columns, grade_ranges))

    fig = scatter_correlation_matrix(dframe,
                                     diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     ranges=ranges_dict)
    return fig
def scm_for_worksheet(df_raw, diagonal='kde', annotate_correlation=True, colour_correlation=True):
    """Plot a scatter/correlation matrix for one module's worksheet.

    NOTE(review): a later definition with the same name exists in this file
    and overrides this one when the module is imported.

    Parameters
    ----------
    df_raw : pandas-style DataFrame
        Dataframe representing the worksheet for a particular module.
        Column 2 is taken as the module mark (weighted total) and columns
        4 onwards as the individual courseworks; the first row is dropped
        as a header row ("Mark", ...).
    diagonal, annotate_correlation, colour_correlation
        Forwarded unchanged to ``scatter_correlation_matrix``.

    Returns
    -------
    The value returned by ``scatter_correlation_matrix``.
    """
    ncols = len(df_raw.columns)

    # 2 = module marks (weighted total); 4..ncols-1 = individual courseworks.
    # list(...) keeps the concatenation valid where range() is not a list.
    indxs = [2] + list(range(4, ncols))
    df = df_raw.iloc[:, indxs]

    # Drop the first row ("Mark", ...) and renumber the rows from 0.
    # drop=True discards the old index instead of adding an 'index' column
    # that would have to be removed again.
    df = df.iloc[1:, :].reset_index(drop=True)

    # Convert every remaining column to float.
    df = df.astype(float)

    # Grade range for each coursework (assumes marks always run 0 to 100).
    ranges_dict = {cname: (0.0, 100.0) for cname in df.columns}

    fig = scatter_correlation_matrix(df,
                                     diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     colour_correlation=colour_correlation,
                                     ranges=ranges_dict)
    return fig
def scm_for_worksheet(df_raw, diagonal='kde', annotate_correlation=True, colour_correlation=True):
    """Plot a scatter/correlation matrix for one module's worksheet.

    NOTE(review): this repeats an earlier definition of the same name in
    this file; being the later one, it is the definition in effect.

    Parameters
    ----------
    df_raw : pandas-style DataFrame
        Dataframe representing the worksheet for a particular module.
        Column 2 is taken as the module mark (weighted total) and columns
        4 onwards as the individual courseworks; the first row is dropped
        as a header row ("Mark", ...).
    diagonal, annotate_correlation, colour_correlation
        Forwarded unchanged to ``scatter_correlation_matrix``.

    Returns
    -------
    The value returned by ``scatter_correlation_matrix``.
    """
    column_count = len(df_raw.columns)

    # Position 2 is the module mark (weighted total); positions from 4 up
    # are the individual courseworks. range() is wrapped in list() so the
    # concatenation also works where range() returns a lazy object.
    selected = [2] + list(range(4, column_count))
    df = df_raw.iloc[:, selected]

    # Skip the first row ("Mark", ...), then renumber rows from 0 without
    # materialising the old index as an 'index' column.
    df = df.iloc[1:, :]
    df = df.reset_index(drop=True)

    # Change dtypes: every remaining column becomes float.
    df = df.astype(float)

    # Set grade ranges for each coursework
    # (assumes all webmark grades are always 0 to 100).
    ranges_dict = {}
    for cname in df.columns:
        ranges_dict[cname] = (0.0, 100.0)

    fig = scatter_correlation_matrix(df,
                                     diagonal=diagonal,
                                     annotate_correlation=annotate_correlation,
                                     colour_correlation=colour_correlation,
                                     ranges=ranges_dict)
    return fig