def PlotSupportVectorsOverlap(root, output_file, data_file=None):
    """Plot the share of support vectors each model has in common with the previous one.

    For every train size from 30 to 100 (step 10), the model file for that
    size and for the size 10 points smaller are read with
    ``u.ReadBinaryFile``. The recorded value is the number of entries of the
    smaller model's ``support_`` that also occur in the larger model's
    ``support_``, divided by the length of the smaller model's ``support_``.

    Args:
        root: Root folder; model paths and output paths are built from it by
            string concatenation.
        output_file: Plot file name, appended to ``root``.
        data_file: Optional CSV file name appended to ``root``. When given,
            the plotted points are also written with the columns ``size %``
            and ``overlap``.
    """
    model_path = root + '/i-0_t-80_T-20/i-0_t-80_ts-{0}/svm/cvresults/cvresults.model'
    train_sizes = []
    overlaps = []
    for size in np.arange(30, 110, 10):
        previous_sv = u.ReadBinaryFile(model_path.format(str(size - 10))).support_
        current_sv = u.ReadBinaryFile(model_path.format(str(size))).support_
        shared = set(previous_sv).intersection(current_sv)
        overlaps.append(len(shared) / len(previous_sv))
        train_sizes.append(size)

    u.SaveDataPlotWithLegends(
        [u.YSeries(overlaps)],
        train_sizes,
        root + "/" + output_file,
        x_axis_name="Train size % used",
        y1_axis_name="Common support vectors fraction wrt previous size %",
        y_limits=[0, 1])

    if data_file is None:
        return
    table = pd.DataFrame({'size %': train_sizes, 'overlap': overlaps})
    table.to_csv(root + '/' + data_file, index=False)
def PlotAvgRewardsPerEpisode(data,
                             totalpoints,
                             points_to_sample,
                             outputfile,
                             y_axis_name,
                             key_to_plot,
                             max_points=100000):
    """Plot one per-episode curve for each selected Q-learning run.

    Rows of ``data`` are selected where ``solver`` is 'q', ``gamma`` is 0.99,
    ``alpha`` is 1 and ``maxInnerVi`` equals ``totalpoints``. Each selected
    row is turned into a series by ``GetQRewardSeriesToPlot`` and all series
    are saved to a single plot.

    Args:
        data: Frame of run results passed to ``u.FilterRows``.
        totalpoints: Number of episode indices to consider (``0`` up to
            ``totalpoints - 1``); also the ``maxInnerVi`` value to match.
        points_to_sample: Only indices divisible by this value are kept.
        outputfile: Destination of the plot.
        y_axis_name: Label of the y axis.
        key_to_plot: Forwarded to ``GetQRewardSeriesToPlot``. The original
            notes list the codes ``cr`` (cum_rewards), ``ar`` (avg_rewards),
            ``len`` (episode_len) and ``goal`` (reached_goal).
        max_points: Indices at or above this value are dropped.
    """

    def is_selected_run(frame):
        return ((frame['solver'] == 'q') & (frame['gamma'] == 0.99) &
                (frame['alpha'] == 1) & (frame['maxInnerVi'] == totalpoints))

    runs = u.FilterRows(data, is_selected_run)

    # Episode indices that are multiples of the sampling step and below the cap.
    episodes = np.arange(totalpoints)
    keep = (episodes % points_to_sample == 0) & (episodes < max_points)
    episodes = episodes[keep]

    def to_series(run):
        return GetQRewardSeriesToPlot(run, episodes, key_to_plot)

    curves = runs.apply(to_series, axis=1)
    u.SaveDataPlotWithLegends(curves.values,
                              filename=outputfile,
                              x_axis_name="episodes",
                              y1_axis_name=y_axis_name)
def PlotMimicProbabilities(inputfile, outputfile, topk, size):
    """Plot three probability columns of a MIMIC run against the iteration count.

    Args:
        inputfile: CSV with at least the columns ``size``, ``iters``, ``n1``,
            ``n2`` and ``root_node_prob``.
        outputfile: Destination of the plot.
        topk: Number of leading rows (after the size filter) to plot.
        size: Only rows whose ``size`` equals this value are used.
    """
    rows = pd.read_csv(inputfile)
    rows = rows[rows['size'] == size].head(topk)

    # (column, line color, point marker, legend label), in plotting order.
    curve_specs = (
        ('n2', 'k', '*', "P(X = 1 | parent = 0)"),
        ('root_node_prob', 'r', 'x', "P(X_root = 1)"),
        ('n1', 'g', 'o', "P(X = 1 | parent = 1)"),
    )
    curves = [
        u.YSeries(rows[column],
                  line_color=color,
                  xvalues=rows['iters'],
                  points_marker=marker,
                  plot_legend_label=label)
        for column, color, marker, label in curve_specs
    ]
    u.SaveDataPlotWithLegends(curves,
                              filename=outputfile,
                              y1_axis_name="probabilities",
                              x_axis_name="iterations")
def Plot(rootfolder, cols_to_plot_dict):
    """Plot each requested column of ``stats_agg.csv`` against ``size``, one line per algorithm.

    Reads ``<rootfolder>/stats_agg.csv`` and, for every key of
    ``cols_to_plot_dict``, writes ``<rootfolder>/plots/<key>.png`` with one
    series for each of the algorithms rhc, sa, mimic and ga.

    Args:
        rootfolder: Folder containing ``stats_agg.csv``; plots go to its
            ``plots`` sub-folder.
        cols_to_plot_dict: Mapping of column name to the y-axis label used
            for that column's plot.
    """
    data = pd.read_csv(rootfolder + "/stats_agg.csv")
    algos = ('rhc', 'sa', 'mimic', 'ga')
    # algorithm -> (line color, point marker, legend label)
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc')
    }
    for col, y_axis_name in cols_to_plot_dict.items():
        y_ser = []
        for algo in algos:
            color, marker, legend_label = algo_decoration[algo]
            # Filter once per algorithm and reuse the rows for both x and y.
            algo_rows = data[data['algo'] == algo]
            y_ser.append(
                u.YSeries(algo_rows[col],
                          points_marker=marker,
                          line_color=color,
                          xvalues=algo_rows['size'],
                          plot_legend_label=legend_label))
        savepath = u.PreparePath(rootfolder + "/plots/" + col + ".png")
        u.SaveDataPlotWithLegends(y_ser,
                                  filename=savepath,
                                  y1_axis_name=y_axis_name,
                                  x_axis_name='size')
def PlotAdaboostPerIterationCurves(file_template,
                                   filter,
                                   plot_output_file,
                                   iters,
                                   y_axis_name='F-Measure'):
    """Plot train and validation metric ``m`` per iteration for several train sizes.

    For each train size in 20..100 (step 10) the CSV at
    ``file_template.format(size)`` is read, filtered with ``filter``, indexed
    by ``iter`` and split on ``istrain``. A train curve and a validation
    curve sharing one color are added per train size.

    Args:
        file_template: Format string with one placeholder for the train size.
        filter: Row filter passed to ``u.FilterRows``.
        plot_output_file: Destination of the plot.
        iters: Iteration values to look up in the ``iter`` index; also used
            as the x values of the plot.
        y_axis_name: Label of the y axis.
    """
    train_sizes = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    palette = u.GetColorCombinations()
    curves = []
    for train_size in train_sizes:
        frame = pd.read_csv(file_template.format(str(train_size)))
        frame = u.FilterRows(frame, filter).set_index('iter')
        train_rows = FilterRows(frame, lambda rows: rows['istrain'] == 1)
        test_rows = FilterRows(frame, lambda rows: rows['istrain'] == 0)

        # One pass over iters, collecting (train, validation) score pairs.
        scores = [(train_rows.loc[n]['m'], test_rows.loc[n]['m'])
                  for n in iters]

        color = palette.pop()['color']
        curves.append(
            u.YSeries([pair[0] for pair in scores],
                      points_marker='o',
                      line_color=color,
                      plot_legend_label=str(train_size) + "-train"))
        curves.append(
            u.YSeries([pair[1] for pair in scores],
                      points_marker='x',
                      line_color=color,
                      plot_legend_label=str(train_size) + "-validation"))

    u.SaveDataPlotWithLegends(curves,
                              iters,
                              plot_output_file,
                              x_axis_name="num of iters/weak learners",
                              y1_axis_name=y_axis_name)
def PlotLossCurvesForNeuralNets(metrics_file, output_file_template):
    """Plot training loss curves, one figure without and one with early stopping.

    For each value of ``earlystopping`` (False, then True) the training rows
    of ``metrics_file`` are indexed by ``train_split_percent_used`` and the
    ';'-separated ``loss_curve`` of every row is plotted against epochs
    1..len(curve).

    Args:
        metrics_file: CSV with the columns ``earlystopping``, ``istrain``,
            ``train_split_percent_used`` and ``loss_curve``.
        output_file_template: Format string with one placeholder that
            receives ``str(earlystopping)``.
    """
    metrics = pd.read_csv(metrics_file)
    curves = []
    for early_stop in (False, True):
        palette = u.GetColorCombinations(4)
        matching = FilterRows(metrics,
                              lambda rows: rows['earlystopping'] == early_stop)
        train_rows = FilterRows(matching, lambda rows: rows['istrain'] == 1)
        train_rows = train_rows.set_index('train_split_percent_used')

        for split in train_rows.index:
            raw_curve = train_rows.loc[split]['loss_curve']
            losses = [float(token) for token in raw_curve.split(';')]
            epochs = np.arange(len(losses)) + 1
            curves.append(
                u.YSeries(losses,
                          points_marker='.',
                          legend_marker='o',
                          line_color=palette.pop()['color'],
                          plot_legend_label=str(split),
                          xvalues=epochs))

        u.SaveDataPlotWithLegends(curves,
                                  None,
                                  output_file_template.format(str(early_stop)),
                                  x_axis_name="epochs",
                                  y1_axis_name="train loss",
                                  x_limits=[1, 200])
        # The same list object is reused for the next figure.
        curves.clear()
def PlotCrossValidationCurves2(dataset_instance_root,
                               plot_output_file,
                               x_axis_name,
                               y_axis_name,
                               title,
                               parameter_value_getter_fn,
                               cv_results_file_getter_fn,
                               cv_save_file=None,
                               should_plot=lambda x: True,
                               label_maker=lambda x: x):
    """Collect mean CV test scores across sub-folders and plot them.

    Every sub-directory of ``dataset_instance_root`` that has a CV results
    file contributes, for each row of that file, one
    ``(parameter value, mean_test_score)`` point to the series keyed by the
    row's ``params`` string.

    The saved plot uses the ``params`` strings (mapped through
    ``label_maker``) as x values and the scores of the first
    parameter-value column as y values.

    Args:
        dataset_instance_root: Folder whose sub-directories are scanned.
        plot_output_file: Destination of the plot.
        x_axis_name: Label of the x axis.
        y_axis_name: Label of the y axis.
        title: Accepted for interface compatibility; not used by this body.
        parameter_value_getter_fn: Maps a sub-directory to its parameter
            value.
        cv_results_file_getter_fn: Maps a sub-directory to the path of its
            CV results CSV.
        cv_save_file: Optional path; when given, the params x
            parameter-value score table is written there as CSV.
        should_plot: Predicate on the ``params`` string. It only affects the
            themed series noted below, which are not plotted.
        label_maker: Maps each ``params`` string to its x-axis label.
    """
    grid = ParameterGrid([{
        'marker': ['o', 'x', 'd', '^', '+', 'v', '8', 's', 'p', '>', '<'],
        'color': [
            'orange', 'red', 'blue', 'green', 'black', 'saddlebrown',
            'violet', 'darkcyan', 'maroon', 'lightcoral'
        ]
    }])
    combinations = [p for p in grid]
    # Fixed seed so the marker/color assignment is repeatable.
    random.seed(30)
    random.shuffle(combinations)

    # params string -> Series of mean test scores indexed by parameter value
    param_dict = {}
    for parameter_value_dataset in u.Get_Subdirectories(dataset_instance_root):
        cv_file_path = cv_results_file_getter_fn(parameter_value_dataset)
        if not os.path.isfile(cv_file_path):
            continue
        cv_results = pd.read_csv(cv_file_path)
        parameter_value = parameter_value_getter_fn(parameter_value_dataset)
        for param, score in zip(cv_results['params'],
                                cv_results['mean_test_score']):
            point = pd.Series({parameter_value: score})
            if param in param_dict:
                # Series.append was removed in pandas 2.0; concat is the
                # equivalent supported spelling.
                param_dict[param] = pd.concat([param_dict[param], point])
            else:
                param_dict[param] = point

    # NOTE(review): these per-params themed series are built and then
    # replaced below, so should_plot and the themes never reach the saved
    # plot. Kept as-is to avoid changing behavior; confirm the intent.
    yseries = []
    for name, value in param_dict.items():
        if not should_plot(name):
            continue
        theme = combinations.pop()
        yseries.append(
            u.YSeries(value.sort_index().values,
                      points_marker=theme['marker'],
                      line_color=theme['color'],
                      plot_legend_label=name))

    transpose_data = pd.DataFrame(param_dict).transpose()
    x_values = list(map(label_maker, transpose_data.index.values))
    first_column = transpose_data.columns[0]
    yseries = [u.YSeries(transpose_data[first_column])]
    u.SaveDataPlotWithLegends(yseries, x_values, plot_output_file, True,
                              x_axis_name, y_axis_name)

    if cv_save_file is not None:
        transpose_data.to_csv(cv_save_file)
def PlotClusteringMetrics(rootfolder, data, k, dim='raw', p=2):
    """Plot each clustering metric against ``k`` for kmeans and gmm.

    Rows of ``data`` are restricted to the given dimensionality-reduction
    method and ``p`` value. One plot per metric (ami_raw, ami_true, sc, bic)
    is then written under ``<rootfolder>/plots/metrics``.

    Args:
        rootfolder: Root folder for the generated plots.
        data: Frame with the columns ``dim_red_method``, ``p``,
            ``clustering``, ``k`` and the metric columns.
        k: Accepted for interface compatibility; not used by this body.
        dim: Value of ``dim_red_method`` to keep.
        p: Value of ``p`` to keep.
    """

    def row_filter(x):
        # Element-wise '&' like every other filter in this file; 'and'
        # raises on Series operands.
        return (x['dim_red_method'] == dim) & (x['p'] == p)

    filtered_data = u.FilterRows(data, row_filter)
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    # clustering method -> (point marker, line color, legend label)
    decorations = {"kmeans": ('o', 'b', 'kmeans'), "gmm": ('x', 'r', 'gmm')}
    rows_by_method = {
        method: filtered_data.loc[filtered_data['clustering'] == method, :]
        for method in ("kmeans", "gmm")
    }

    for metric in metrics:
        outputfile = u.PreparePath(
            rootfolder +
            "/plots/metrics/{0}_{1}_p={2}.png".format(metric, dim, str(p)))
        series = []
        for method in ("kmeans", "gmm"):
            marker, color, label = decorations[method]
            rows = rows_by_method[method]
            series.append(
                u.YSeries(rows[metric],
                          xvalues=rows["k"],
                          points_marker=marker,
                          line_color=color,
                          plot_legend_label=label))
        u.SaveDataPlotWithLegends(series,
                                  x_axis_name="number of clusters",
                                  y1_axis_name=metric,
                                  filename=outputfile)
def PlotClusteringMetricsForDimsAndBic(rootfolder, data, dims, dim_reds,k,metrics = ["ami_raw","ami_true","sc","bic"]):
    """Plot clustering metrics against ``k`` for each (p, metric, reduction method).

    For every combination of a value in ``dims``, a metric and a
    dimensionality-reduction method, one plot with a kmeans and a gmm series
    is written under ``<rootfolder>/plots/metrics``.

    Args:
        rootfolder: Root folder for the generated plots.
        data: Frame with the columns ``dim_red_method``, ``p``,
            ``clustering``, ``k`` and the metric columns.
        dims: Values of the ``p`` column to plot.
        dim_reds: Values of ``dim_red_method`` to plot; each must be a key
            of the color table below.
        k: Accepted for interface compatibility; not used by this body.
        metrics: Metric columns to plot.
    """
    line_colors = {"ica": 'r', 'pca': 'b', 'rp': 'g', 'mi': 'k', 'raw': 'orange'}
    point_markers = {"kmeans": 'o', "gmm": 'x'}
    for p_value in dims:
        for metric in metrics:
            for dim_red in dim_reds:
                outputfile = u.PreparePath(rootfolder + "/plots/metrics/dr_{0}_p={1}_{2}.png".format(metric, str(p_value), dim_red))
                curves = []
                for method in ('kmeans', 'gmm'):
                    mask = ((data['dim_red_method'] == dim_red)
                            & (data['p'] == p_value)
                            & (data['clustering'] == method))
                    rows = data.loc[mask, :]
                    curves.append(
                        u.YSeries(rows[metric],
                                  xvalues=rows['k'],
                                  line_color=line_colors[dim_red],
                                  points_marker=point_markers[method],
                                  plot_legend_label="{0}-{1}".format(dim_red, method)))
                u.SaveDataPlotWithLegends(curves,
                                          x_axis_name="k",
                                          y1_axis_name=metric,
                                          filename=outputfile)
def PlotCrossValidationCurvesForWeka(cv_file,
                                     model_complexity_param_name,
                                     metric_name,
                                     plt_output_file,
                                     title,
                                     x_axis_name,
                                     y_axis_name,
                                     rows_filter_fn=None):
    """Plot one metric column against a model-complexity column of a CSV.

    Args:
        cv_file: CSV file with the cross-validation results.
        model_complexity_param_name: Column used as the sorted x axis.
        metric_name: Column plotted on the y axis.
        plt_output_file: Destination of the plot.
        title: Plot title.
        x_axis_name: Label of the x axis.
        y_axis_name: Label of the y axis.
        rows_filter_fn: Optional row filter applied with ``FilterRows``
            before plotting.
    """
    table = pd.read_csv(cv_file)
    if rows_filter_fn is not None:
        table = FilterRows(table, rows_filter_fn)

    by_param = table[[model_complexity_param_name, metric_name]]
    by_param = by_param.set_index(model_complexity_param_name).sort_index()

    curve = u.YSeries(by_param[metric_name])
    u.SaveDataPlotWithLegends([curve], by_param.index, plt_output_file, True,
                              x_axis_name, y_axis_name, title)
def TestPlotting():
    """Smoke-test ``u.SaveDataPlotWithLegends`` with an x and an x^2 series.

    Saves the figure to the hard-coded path ``c:/temp/testfig.png`` and then
    shows it interactively.
    """
    x = np.arange(10)
    # The series is labelled 'x^2' and the plot is titled "x square", so the
    # data must be the square, not the double.
    y1 = u.YSeries(x ** 2,
                   line_style='-',
                   points_marker='o',
                   line_color='r',
                   plot_legend_label='x^2')
    y2 = u.YSeries(np.arange(10),
                   line_style='-',
                   points_marker='x',
                   line_color='b',
                   plot_legend_label='x')
    fig, ax = u.SaveDataPlotWithLegends([y1, y2],
                                        x,
                                        r"c:/temp/testfig.png",
                                        dispose_fig=False,
                                        x_axis_name="x values",
                                        y1_axis_name="y values",
                                        title="x square")
    # pyplot.show() takes no figure argument (its only parameter is `block`);
    # the figure was kept open via dispose_fig=False, so it is shown here.
    plt.show()
def PlotPerIterationCurves(rootFolder, outputfolder):
    """Plot ``fn_value`` against ``iters`` for four optimizers, one figure per size.

    Reads ``mimic.csv``, ``sa.csv``, ``rhc.csv`` and ``ga.csv`` from
    ``rootFolder``. For every distinct ``size`` in the mimic data, the first
    10000 rows of that size from each file are plotted together in
    ``<outputfolder>/itercurves_<size>.png``.

    Args:
        rootFolder: Folder containing the four CSV files.
        outputfolder: Folder that receives the plots.
    """
    mimic = pd.read_csv(rootFolder + "/mimic.csv")
    sa = pd.read_csv(rootFolder + "/sa.csv")
    rhc = pd.read_csv(rootFolder + "/rhc.csv")
    ga = pd.read_csv(rootFolder + "/ga.csv")

    # algorithm -> (line color, marker (unused here), legend label, data)
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic', mimic),
        'ga': ('g', 's', 'genetic algo', ga),
        'sa': ('b', '+', 'sim annealing', sa),
        'rhc': ('k', '*', 'rhc', rhc)
    }

    def build_curve(rows, algo):
        color, _marker, label, _frame = algo_decoration[algo]
        return u.YSeries(rows['fn_value'],
                         xvalues=rows['iters'],
                         points_marker='.',
                         plot_legend_label=label,
                         legend_marker='o',
                         line_color=color)

    for size in np.array(mimic['size'].unique()):
        plot_path = u.PreparePath(outputfolder + "/itercurves_" + str(size) +
                                  ".png")
        curves = []
        for algo, decoration in algo_decoration.items():
            rows = u.FilterRows(decoration[3],
                                lambda x: x['size'] == size).head(10000)
            curves.append(build_curve(rows, algo))
        u.SaveDataPlotWithLegends(curves,
                                  x_axis_name="iters",
                                  x=None,
                                  y1_axis_name="fn value",
                                  filename=plot_path)
def PlotTempVariationCurvesForSa(rootfolder, algoname, temperatures):
    """Plot iterations-to-converge against size for rhc and several SA settings.

    The rhc baseline comes from ``<rootfolder>/<algoname>/stats_agg.csv``.
    For every entry ``t`` of ``temperatures``, ``CompteStats`` is run on
    ``<rootfolder>/<algoname>_<t>`` and that folder's ``stats_agg.csv`` is
    then read and added as the series ``sa_<t>``.

    Args:
        rootfolder: Root folder of the experiment outputs.
        algoname: Name used to build the per-setting folder names.
        temperatures: Iterable of strings; each must be one of the keys of
            the decoration table below ('0', '90', '95', '99').
    """
    baseline = u.FilterRows(
        pd.read_csv(rootfolder + "/" + algoname + "/stats_agg.csv"),
        lambda x: x['algo'] == 'rhc')
    curves = [
        u.YSeries(baseline['converged_iters'],
                  xvalues=baseline['size'],
                  points_marker="*",
                  line_color="k",
                  plot_legend_label="rhc")
    ]

    # setting -> (line color, point marker)
    decorations = {
        '0': ("r", "x"),
        '90': ("b", "o"),
        '95': ("g", "+"),
        '99': ("orange", ">")
    }
    stats_by_setting = {}
    for setting in temperatures:
        folder = rootfolder + "/" + algoname + "_" + setting
        # Called before stats_agg.csv of the same folder is read.
        CompteStats(folder, ["sa.csv"])
        stats = pd.read_csv(folder + "/stats_agg.csv")
        stats_by_setting[setting] = stats
        curves.append(
            u.YSeries(stats['converged_iters'],
                      xvalues=stats['size'],
                      points_marker=decorations[setting][1],
                      line_color=decorations[setting][0],
                      plot_legend_label="sa_" + setting))

    u.SaveDataPlotWithLegends(
        curves,
        y1_axis_name="iterations to converge",
        x_axis_name="size",
        filename=rootfolder + "/" + algoname + "/plots/sa_temperatures.png")
def NNetAnalysis(output_root,
                 output_file_prefix,
                 metrics_file,
                 iters_to_ignore,
                 y_axis_name="F-Measure"):
    """Plot aggregated neural-net metrics against the train split size.

    Rows with a non-NaN ``train_split_percent_used`` and ``total_iter``
    above ``iters_to_ignore`` are aggregated by
    (train split, earlystopping, istrain). For each of p, r, m and
    modelbuildtimesecs a plot with four series is written: train and
    validation, each with and without early stopping. A train-split value
    missing from a series is plotted as 0.

    Args:
        output_root: Folder for the plots.
        output_file_prefix: Prefix of every plot file name.
        metrics_file: CSV with the raw metrics.
        iters_to_ignore: Rows need ``total_iter`` above this value.
        y_axis_name: Axis label used for the ``m`` metric.

    Returns:
        The aggregated frame returned by ``GetAggMetrics``.
    """
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    def make_filter(with_earlystopping, istrain):
        """Build a row filter for one (earlystopping, istrain) combination."""

        def row_filter(frame):
            if with_earlystopping:
                stopped = frame['earlystopping']
            else:
                stopped = frame['earlystopping'] == False
            return stopped & (frame['istrain'] == istrain)

        return row_filter

    def values_or_zero(frame, key_col, value_col, required_keys):
        """Return value_col for every required key, using 0 where a key is absent."""
        lookup = dict(zip(frame[key_col], frame[value_col]))
        return [lookup.get(key, 0) for key in required_keys]

    # (name, earlystopping, istrain, line color, point marker, legend label),
    # in the order the series are built.
    series_specs = (
        ('train_es', True, 1, 'r', 'o', "Train_with_earlystopping"),
        ('train_no_es', False, 1, 'r', 'x', "Train_without_earlystopping"),
        ('test_es', True, 0, 'b', 'o', "Validation_with_earlystopping"),
        ('test_no_es', False, 0, 'b', 'x', "Validation_without_earlystopping"),
    )
    # Order in which the series are handed to the plot.
    plot_order = ('test_es', 'test_no_es', 'train_no_es', 'train_es')

    for dataset_type in dataset_types:

        def filter_query(frame):
            return (~np.isnan(frame[dataset_type]) &
                    (frame['total_iter'] > iters_to_ignore))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'earlystopping', 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, aggregations in col_funcs.items():
            for agg in aggregations:
                value_col = k + "_" + agg
                built = {}
                for name, with_es, istrain, color, marker, label in series_specs:
                    rows = FilterRows(data_agg, make_filter(with_es, istrain))
                    built[name] = u.YSeries(
                        values_or_zero(rows, dataset_type, value_col, x),
                        line_color=color,
                        points_marker=marker,
                        plot_legend_label=label)

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                _fig, _ax = u.SaveDataPlotWithLegends(
                    [built[name] for name in plot_order], x,
                    output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'Neural Nets Performance')
    return data_agg
def AdaBoostAnalysis(output_root, output_file_prefix, metrics_file):
    """Plot train/test AdaBoost metrics per iteration for each dataset variation.

    For each dataset type (train split, imbalance, noise), rows with a
    non-NaN value in that column are aggregated by
    (dataset type, prune, istrain, iter). For every metric (p, r, m),
    aggregation (mean, std), dataset-type value and prune value, a
    train-vs-test plot over the iterations is written, unless a file with
    that name already exists. The path of every plot is printed.

    Args:
        output_root: Folder for the plots.
        output_file_prefix: Prefix of every plot file name.
        metrics_file: CSV with the raw metrics.

    Raises:
        AssertionError: If the aggregated data does not hold exactly one
            train row and one test row for a combination.
    """
    data_all = pd.read_csv(metrics_file)
    dataset_types = [
        'train_split_percent_used', 'imbalance_perc', 'noise_perc'
    ]
    col_funcs = {
        'p': ['mean', 'std'],
        'r': ['mean', 'std'],
        'm': ['mean', 'std']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        dataset_types[1]: 'Fraction of postives to negatives',
        dataset_types[2]: 'Noise %',
        'modelbuildtimesecs': 'Time to build AdaBoost model (sec)'
    }
    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'prune', 'istrain', 'iter'])

        # These depend only on data_agg, so compute them once per dataset type.
        iterations = np.sort(data_agg['iter'].unique())
        prune_vals = data_agg['prune'].unique()
        dataset_type_values = data_agg[dataset_type].unique()

        for metric, v in col_funcs.items():
            for agg in v:
                metric_col = metric + "_" + agg
                for type_val in dataset_type_values:
                    for prune_val in prune_vals:
                        y_test = []
                        y_train = []
                        for i in iterations:
                            filtered_data = data_agg[
                                (data_agg['prune'] == prune_val)
                                & (data_agg['iter'] == i)
                                & (data_agg[dataset_type] == type_val)]
                            train_data = filtered_data[
                                filtered_data['istrain'] == 1]
                            assert (len(train_data) == 1)
                            y_train.append(train_data[metric_col].iloc[0])
                            test_data = filtered_data[
                                filtered_data['istrain'] == 0]
                            assert (len(test_data) == 1)
                            y_test.append(test_data[metric_col].iloc[0])

                        # Test and train values for each iteration are now
                        # available, so the plot can be produced.
                        output_file_name = u.PreparePath(
                            "{4}/{0}.{1}.prune-{5}.{6}-{7}.{2}.{3}.png".format(
                                output_file_prefix, dataset_type, metric, agg,
                                output_root, prune_val, dataset_type,
                                type_val))
                        y_train_series = u.YSeries(y_train,
                                                   line_color='r',
                                                   plot_legend_label='train')
                        y_test_series = u.YSeries(y_test,
                                                  line_color='b',
                                                  plot_legend_label='test')
                        # Logical negation: bitwise '~' on a bool yields -1 or
                        # -2, both truthy, so the old guard never skipped an
                        # existing file.
                        if not os.path.isfile(output_file_name):
                            u.SaveDataPlotWithLegends(
                                [y_train_series, y_test_series], iterations,
                                output_file_name, True, "num of iterations",
                                mapping_output_words[metric],
                                "AdaBoost Performance ({0})".format(agg))
                        print(output_file_name)
def _PlotPiViConvergenceForMdp(data, mdp_name, file_prefix, outputfolder,
                               gamma):
    """Write the two VI-vs-PI convergence plots for a single MDP.

    The ';'-separated ``cum_rewards`` column is plotted under the axis label
    "Max change in state value" and the ``ran_to_completion`` column under
    "Total value accross states". Each plot holds one VI curve followed by
    one PI curve per sweep setting (1, 10, 10000).

    Args:
        data: Frame of solver results.
        mdp_name: Value of the ``mdp`` column to select.
        file_prefix: Leading part of the plot file names.
        outputfolder: Folder whose ``plots`` sub-folder receives the files.
        gamma: Value of the ``gamma`` column to select.

    Raises:
        AssertionError: If the VI selection or any PI selection does not
            match exactly one row.
    """
    sweep_colors = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]

    def parse(cell):
        return np.array([float(s) for s in cell.split(';')])

    def curve(values, color, label):
        return u.YSeries(values,
                         xvalues=np.arange(len(values)) + 1,
                         line_color=color,
                         plot_legend_label=label)

    # The VI row does not depend on the sweep setting, so look it up once.
    data_vi = u.FilterRows(
        data, lambda x: (x['mdp'] == mdp_name) & (x['solver'] == 'vi') &
        (x['gamma'] == gamma))
    assert (len(data_vi) == 1)
    qchange_series = [curve(parse(data_vi.iloc[0]['cum_rewards']), 'b', 'VI')]
    value_series = [
        curve(parse(data_vi.iloc[0]['ran_to_completion']), 'b', 'VI')
    ]

    for sweep in pi_sweeps:
        data_pi = u.FilterRows(
            data, lambda x: (x['mdp'] == mdp_name) & (x['solver'] == 'pi') &
            (x['gamma'] == gamma) & (x['maxSweepsPerIteration'] == sweep))
        assert (len(data_pi) == 1)
        label = 'PI_' + str(sweep)
        qchange_series.append(
            curve(parse(data_pi.iloc[0]['cum_rewards']), sweep_colors[sweep],
                  label))
        value_series.append(
            curve(parse(data_pi.iloc[0]['ran_to_completion']),
                  sweep_colors[sweep], label))

    outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                               "_qchange_gamma=" + str(gamma) + ".png")
    u.SaveDataPlotWithLegends(qchange_series,
                              filename=outputfile,
                              x_axis_name="iterations",
                              y1_axis_name="Max change in state value")
    outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                               "_value_gamma=" + str(gamma) + ".png")
    u.SaveDataPlotWithLegends(value_series,
                              filename=outputfile,
                              x_axis_name="iterations",
                              y1_axis_name="Total value accross states")


def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    """Plot VI and PI convergence curves for the large and the small MDP.

    Reads ``datafile`` and writes four plots under ``<outputfolder>/plots``:
    ``large_qchange``, ``large_value``, ``small_qchange`` and
    ``small_value``, each suffixed with ``_gamma=<gamma>.png``.

    Args:
        outputfolder: Folder whose ``plots`` sub-folder receives the files.
        datafile: CSV with the columns ``mdp``, ``solver``, ``gamma``,
            ``maxSweepsPerIteration``, ``cum_rewards`` and
            ``ran_to_completion``.
        gamma: Value of the ``gamma`` column to select.
    """
    data = pd.read_csv(datafile)
    _PlotPiViConvergenceForMdp(data, 'LargeMdpRwTraps50', 'large',
                               outputfolder, gamma)
    _PlotPiViConvergenceForMdp(data, 'SmallMdpRwTraps', 'small', outputfolder,
                               gamma)
def SvmAnalysis(output_root,
                output_file_prefix,
                metrics_file,
                dataset_filter_fn=None,
                y_axis_name="F-Measure"):
    """Plot aggregated SVM metrics against the train split size.

    The ';'-separated ``numsupportvectors`` column is replaced by its sum.
    Rows with a non-NaN ``train_split_percent_used`` are aggregated by
    (train split, istrain). One plot is written per aggregated metric
    (p, r, m, modelbuildtimesecs); the model build time plot shows the train
    series only, the others show validation and train.

    Args:
        output_root: Folder for the plots.
        output_file_prefix: Prefix of every plot file name.
        metrics_file: CSV with the raw metrics.
        dataset_filter_fn: Optional row filter applied before aggregation.
        y_axis_name: Axis label used for the ``m`` metric.

    Returns:
        The aggregated frame returned by ``GetAggMetrics``.
    """

    def total_support_vectors(encoded):
        return np.array([int(part) for part in encoded.split(';')]).sum()

    data_all = pd.read_csv(metrics_file)
    data_all['numsupportvectors'] = data_all['numsupportvectors'].apply(
        total_support_vectors)

    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)',
        'numsupportvectors': 'Number of Support Vectors'
    }
    train_only_metrics = ('numsupportvectors', 'modelbuildtimesecs')

    def is_train(frame):
        return frame['istrain'] == 1

    def is_test(frame):
        return frame['istrain'] == 0

    for dataset_type in dataset_types:

        def has_dataset_value(frame):
            return ~np.isnan(frame[dataset_type])

        if dataset_filter_fn is not None:
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, has_dataset_value)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, aggregations in col_funcs.items():
            for agg in aggregations:
                value_col = k + "_" + agg
                y_train = u.YSeries(FilterRows(data_agg, is_train)[value_col],
                                    line_color='r',
                                    points_marker='o',
                                    plot_legend_label="Train")
                y_test = u.YSeries(FilterRows(data_agg, is_test)[value_col],
                                   line_color='b',
                                   points_marker='o',
                                   plot_legend_label='validation')
                if k in train_only_metrics:
                    y_series = [y_train]
                else:
                    y_series = [y_test, y_train]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                _fig, _ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], 'SVM Performance')
    return data_agg
def DecisionTreeAnalysis(output_root,
                         output_file_prefix,
                         metrics_file,
                         dataset_filter_fn=None,
                         plt_title="Decision Trees Performance",
                         y_axis_name='F-Measure'):
    """Plot aggregated decision-tree metrics against the train split size.

    Rows with a non-NaN ``train_split_percent_used`` are aggregated by
    (train split, prune, istrain). One plot is written per aggregated metric
    (p, r, m, modelbuildtimesecs). When the pruned train series is empty,
    only the unpruned series are plotted and relabelled "Train" and
    "Validation". The model build time plots contain train series only.

    Args:
        output_root: Folder for the plots.
        output_file_prefix: Prefix of every plot file name.
        metrics_file: CSV with the raw metrics.
        dataset_filter_fn: Optional row filter applied before aggregation.
        plt_title: Title of every plot.
        y_axis_name: Axis label used for the ``m`` metric.

    Returns:
        The aggregated frame returned by ``GetAggMetrics``.
    """
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    def make_filter(pruned, istrain):
        """Build a row filter for one (prune, istrain) combination."""

        def row_filter(frame):
            if pruned:
                prune_mask = frame['prune']
            else:
                prune_mask = frame['prune'] == False
            return prune_mask & (frame['istrain'] == istrain)

        return row_filter

    for dataset_type in dataset_types:

        def has_dataset_value(frame):
            return ~np.isnan(frame[dataset_type])

        if dataset_filter_fn is not None:
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, has_dataset_value)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'prune', 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, aggregations in col_funcs.items():
            for agg in aggregations:
                value_col = k + "_" + agg

                def build(pruned, istrain, color, marker, label):
                    rows = FilterRows(data_agg, make_filter(pruned, istrain))
                    return u.YSeries(rows[value_col],
                                     line_color=color,
                                     points_marker=marker,
                                     plot_legend_label=label)

                train_pruned = build(True, 1, 'r', 'o', "Train_with_pruning")
                train_unpruned = build(False, 1, 'r', 'x',
                                       "Train_without_pruning")
                test_pruned = build(True, 0, 'b', 'o',
                                    "Validation_with_pruning")
                test_unpruned = build(False, 0, 'b', 'x',
                                      "Validation_without_pruning")

                time_only = k == 'modelbuildtimesecs'
                if len(train_pruned.values) == 0:
                    # No pruned results: plot the unpruned series with plain labels.
                    test_unpruned.plot_legend_label = "Validation"
                    train_unpruned.plot_legend_label = "Train"
                    if time_only:
                        y_series = [train_unpruned]
                    else:
                        y_series = [test_unpruned, train_unpruned]
                elif time_only:
                    y_series = [train_unpruned, train_pruned]
                else:
                    y_series = [
                        test_pruned, test_unpruned, train_unpruned,
                        train_pruned
                    ]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                _fig, _ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], plt_title)
    return data_agg
def RunExperiments(X,Y,rootfolder,clusters,dims,compute_acc=None):
    """Run the dimensionality-reduction and clustering experiments and save their outputs.

    Steps, in order:
      1. Optionally run the neural network on the raw data (when
         ``compute_acc`` is not None).
      2. For PCA, ICA, random projections and MI-based feature selection:
         compute the reduced data for every value in ``dims``, store a
         standard-scaled copy in ``datasets`` and write the per-method plots.
      3. For every (dim, reduction method, cluster count, clustering method):
         cluster, save predictions and scatter plots, and append one CSV line
         of metrics.
      4. Optionally (when ``compute_acc`` is truthy) run the neural network
         on the selected clustering outputs and write ``nn.csv``.
      5. Write ``clustering.csv``.

    Args:
        X: Feature data handed to the reduction and clustering helpers.
        Y: Labels; stored next to every dataset and used as the actual labels.
        rootfolder: Root folder for all outputs.
        clusters: Iterable of cluster counts.
        dims: Iterable of target dimensionalities.
        compute_acc: Forwarded to ``RunNeuralNetwork``; also acts as the
            switch for the neural-network steps.
    """
    # key -> (data, labels); "raw" plus one "<method>_<dim>_" entry per reduction.
    datasets = {}
    datasets["raw"] = (X,Y)
    err_series = []  # not used in this function
    # method -> (point marker, line color, legend label)
    decorations = {}
    decorations["pca"] = ("o","r","pca")
    decorations["ica"] = ("x","b","ica")
    decorations["rp"] = ("+","g","rp")
    decorations["mi"] = ("o","k","mi")
    # One switch per section below (PCA, ICA, RP, MI). NOTE(review): the ICA
    # and RP sections append to recons_err_dict, which is created only in the
    # PCA section, so they require flags[0] to be True.
    flags = [True,True,True,True]
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if(compute_acc is not None):
        # Baseline: the header and the result line for the raw data are both kept.
        h,l = CreateOutputLineForNN(RunNeuralNetwork(X,Y,10,compute_acc,False),"raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)

    best_bic = None  # not used in this function
    ################### PCA #####################
    if(flags[0]):
        pca_results = PerformPca(X,Y,dims,0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
        # Despite the name this is a list of series (one per method).
        recons_err_dict = []
        var_y = []
        err_y = []
        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(pca_results["{0}data".format(key)]),Y)
            err_y.append(pca_results[key+"reconstruction_error"])
            # Overwritten on every pass: only the value for the last dim is plotted.
            var_y = pca_results[key+"explained_var_ratio"]
            # (disabled) a neural-network run on the dim == 2 projection was here
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="dimensions",y1_axis_name="% explained variance",filename=pca_var_explained_plot)

    ################### ICA #####################
    if(flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X,Y,dims,0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(ica_results[key+"data"]),Y)
            err_y.append(ica_results[key+"reconstruction_error"])
            # (disabled) a neural-network run on the dim == 2 projection was here
        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="components",y1_axis_name="kurtosis",filename=ica_kt_plot)

    ################### RP #####################
    if(flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10
        rp_results = PerformRandomProjections(X,Y,dims,runs)
        runs_series = []
        # Only the "color" entry of each combination is used below.
        markers = u.GetColorCombinations(10)
        i=0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(rp_results[key+"data"]),Y)
            err_y.append(rp_results[key+"reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key+"reconstruction_errors_all"],xvalues=np.arange(runs)+1,points_marker = "o",line_color = markers[i]["color"],plot_legend_label="proj dims = "+str(dim))
            runs_series.append(runs_ser)
            i = i + 1
            # (disabled) a neural-network run on the dim == 2 projection was here
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["rp"][0],line_color=decorations["rp"][1],plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series,x_axis_name="run number",y1_axis_name="reconstruction err",filename=rp_runs_plot)
        # Combined reconstruction-error plot for the methods collected so far.
        u.SaveDataPlotWithLegends(recons_err_dict,x_axis_name="dimensions",y1_axis_name="reconstruction_error",filename=recons_err_plot)

    ###################### MI Feature Selection #########################
    if(flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X,Y,dims,10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(mi_results[key+"data"]),Y)
            # (disabled) a neural-network run on the dim == 2 selection was here
        ser = u.YSeries(mi_results["scores"],xvalues = np.arange(len(mi_results["scores"])) + 1,points_marker=decorations["mi"][0],line_color=decorations["mi"][1],plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="feature number",y1_axis_name="mutual information", filename=mi_plot)

    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    # "<k>_<method>" -> clustering results on the raw data (reused for every dim)
    raw_clustering_results = {}
    # method key -> (new_data, labels, ami, bic) of the lowest-bic result seen
    best_bic_raw_clustering = {}
    curr_best_bic = {}
    actual_labels = Y
    for dim in dims:
        # "raw" must come first: the other methods compare against its results.
        for algo in ["raw","ica","rp","mi","pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo,str(dim))
            if(algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans","gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster),mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(str(dim),algo,str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd,str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd,str(cluster))
                    if(algo == "raw" and raw_key in raw_clustering_results):
                        # Raw data does not depend on dim, so reuse the cached result.
                        results = raw_clustering_results[raw_key]
                    else:
                        # (disabled) a variant that also ran the neural network
                        # for raw data with cluster == 2 was here
                        results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        if(algo == "raw"):
                            raw_clustering_results[raw_key] = results
                        if(compute_acc):
                            # Raw results share one key per method; reduced data
                            # gets one key per (method, algo, cluster, dim).
                            mthd_key = mthd+algo if algo == "raw" else mthd+algo+str(cluster)+str(dim)
                            if((mthd_key not in curr_best_bic) or (curr_best_bic[mthd_key] > results[c_key1+"bic"])):
                                curr_best_bic[mthd_key] = results[c_key1+"bic"]
                                best_bic_raw_clustering[mthd_key] = (results[c_key1+"new_data"],dataset[1],results[c_key1+"metrics"]["ami"],results[c_key1+"bic"])
                                print("new best {0} {1}".format(c_key1,str(results[c_key1+"bic"])))
                    clustering_prediction_file = u.PreparePath(rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(mthd,str(cluster),str(dim),algo))
                    np.savetxt(clustering_prediction_file,results[c_key])
                    bic = c_key.replace("predicted","bic")
                    bic = results[bic]
                    # Metrics against the actual labels and against the raw-data clustering.
                    act = ComputeClusteringMetrics(actual_labels,results[c_key],dataset[0])
                    raw = ComputeClusteringMetrics(raw_clustering_results[raw_key][c_key],results[c_key],dataset[0])
                    # NOTE(review): the "sc" column of the CSV is filled from raw["sl"] - confirm the key.
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(mthd,algo,str(cluster),str(dim),str(raw["ami"]),str(act["ami"]),str(raw["sl"]),str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_{3}.png".format(mthd,str(cluster),algo,str(dim))
                    # (disabled) saving of the gmm probabilities to CSV was here
                    ScatterPlotForClustering(results[c_key],actual_labels,plot_output_file)
                    if(dim == 2 and algo != "raw"):
                        if(raw_data_plot_done == False):
                            # Once per algo: the 2-d data with all-zero cluster assignments.
                            plot_output_file = clustering_plots_output_root + "/{0}_{1}_data.png".format(mthd,algo)
                            ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],np.zeros_like(actual_labels),actual_labels,plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_data.png".format(mthd,str(cluster),algo)
                        ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],results[c_key],actual_labels,plot_output_file)
                    lines.append(line)

    # (disabled) an earlier version of the neural-network selection below was here
    if(compute_acc):
        keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","pca":"pca","ica":"ica","rp":"rp","mi":"mi"}
        for key in keys_to_output.keys():
            if("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {} # {x,y,p,k,bic,ami}
                    for cluster_mthd in ["kmeans","gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd+key+str(cluster)+str(dim)
                            # NOTE(review): this compares the candidate's ami (index 2)
                            # with the stored tuple's bic (index 4); index 5 holds the
                            # stored ami - confirm which comparison is intended.
                            if(cluster_mthd not in best or best_bic_raw_clustering[datakey][2] > best[cluster_mthd][4]):
                                best[cluster_mthd] = (best_bic_raw_clustering[datakey][0],best_bic_raw_clustering[datakey][1],dim,cluster,best_bic_raw_clustering[datakey][3],best_bic_raw_clustering[datakey][2])
                    # Score a dim by the mean ami of its best kmeans and gmm entries.
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if(dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best
                # Only the gmm entry of the winning dim is fed to the network.
                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]
            h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file,nn_output_lines)
    u.WriteTextArrayToFile(clustering_output_file,lines)
def KnnAnalysis(output_root, output_file_prefix, metrics_file):
    """Plot aggregated KNN test metrics per neighbor count and weighting.

    Reads the metrics CSV, keeps the rows with ``istrain == 0`` that have a
    non-NaN value in the dataset-type column, aggregates them through
    ``GetAggMetrics`` grouped by (dataset type, weights, neighbors) and, for
    every metric/aggregate pair, saves one plot for the ``distance``
    weighting and one for the ``uniform`` weighting.

    Args:
        output_root: Directory under which the plot files are written.
        output_file_prefix: Prefix used for every generated plot file name.
        metrics_file: Path of the CSV file holding the KNN metrics.

    Returns:
        The aggregated metrics of the last processed dataset type.
    """
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    # Axis labels for the dataset-type columns, in positional order. They are
    # paired with ``dataset_types`` through zip (which stops at the shorter
    # list) instead of indexing dataset_types[1] / [2], which raised
    # IndexError because only one dataset type is currently enabled.
    dataset_type_labels = [
        'Train size % used',
        'Fraction of postives to negatives',
        'Noise %',
    ]
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelevaltimesecs': ['mean'],
    }
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        'modelevaltimesecs': 'Time to run Knn model (sec)',
    }
    mapping_output_words.update(zip(dataset_types, dataset_type_labels))

    nneighbors = [5, 10, 20, 50]
    # neighbors -> (line color, point marker)
    marker_and_color_map = {
        5: ('g', 'o'),
        10: ('r', '+'),
        20: ('b', 'x'),
        50: ('k', 'd'),
    }
    # (value in the 'weights' column, tag used in the output file name);
    # the distance weighting is plotted first, then the uniform one.
    weightings = [('distance', 'weighted'), ('uniform', 'uniform')]

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) & (x['istrain'] == 0))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'weights', 'neighbors'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                # NOTE(review): assumes GetAggMetrics names aggregated
                # columns "<metric>_<agg>" - confirm against its definition.
                column = k + "_" + agg
                for weights_value, file_tag in weightings:
                    data_for_weighting = FilterRows(
                        data_agg,
                        lambda row, w=weights_value: row['weights'] == w)
                    y_series = []
                    for n in nneighbors:
                        d = data_for_weighting[
                            data_for_weighting['neighbors'] == n]
                        color, marker = marker_and_color_map[n]
                        y_series.append(
                            u.YSeries(d[column],
                                      line_color=color,
                                      points_marker=marker,
                                      plot_legend_label="k = " + str(n)))
                    output_file_name = u.PreparePath(
                        "{4}/{0}.{1}.{5}.{2}.{3}.png".format(
                            output_file_prefix, dataset_type, k, agg,
                            output_root, file_tag))
                    u.SaveDataPlotWithLegends(
                        y_series, x, output_file_name, True,
                        mapping_output_words[dataset_type],
                        mapping_output_words[k], 'K Nearest Neighbor')
    return data_agg
def NeuralNetworkResults(rootfolder):
    """Plot learning, timing and loss curves for the 10k neural-net runs.

    Loads the results through ``ReadNNetResultsFile10k`` and writes, under
    ``<rootfolder>/plot10k``:

    * ``learning_curves.png`` - train/validation f-measure per train size
      for every algorithm,
    * ``time.png`` - run time per train size for every algorithm,
    * ``loss_curves_<size>.png`` - loss per iteration of every algorithm
      for the sizes in ``size_for_loss_curves``,
    * ``loss_curves_ga_<size>.png`` - loss of the two genetic-algorithm
      runs (labelled tournament selection and roulette wheel) for the
      sizes in ``size_for_loss_ga``.

    Args:
        rootfolder: Folder holding the results; plots go to its
            ``plot10k`` sub-folder.
    """
    data, ga = ReadNNetResultsFile10k(rootfolder)
    ga = ga.set_index('size')
    algos = ['ga', 'rhc', 'sa', 'bp']
    # algo -> (line color, point marker, legend label)
    algo_decoration = {
        'bp': ('r', 'o', 'backprop'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc'),
    }
    sizes = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    y_ser = []
    time_y_ser = []
    # Train sizes for which loss curves are drawn; values collect the series.
    size_for_loss_curves = {20: [], 90: [], 100: []}
    size_for_loss_ga = {20: [], 50: [], 100: []}

    for algo in algos:
        color, marker, label = algo_decoration[algo]
        filtered_data = data[data['algo'] == algo].set_index('size')
        train_ser = []
        valid_ser = []
        x = []
        times = []
        for size in sizes:
            row = filtered_data.loc[size]
            x.append(size)
            train_ser.append(row['train_f1'])
            valid_ser.append(row['valid_f1'])
            times.append(row['time'])
            if size in size_for_loss_curves:
                # The 'loss' cell is a ';'-separated list of loss values.
                y_vals = np.array(row['loss'].split(';'), dtype=float)
                x_vals = np.arange(y_vals.size) + 1
                size_for_loss_curves[size].append(
                    u.YSeries(y_vals,
                              xvalues=x_vals,
                              line_color=color,
                              points_marker='.',
                              legend_marker='o',
                              plot_legend_label=label))
            if algo == "ga" and size in size_for_loss_ga:
                ga_y_vals = np.array(row['loss'].split(';'), dtype=float)
                # Only the first 10000 values of the second GA run are used.
                bad_ga_y_vals = np.array(ga.loc[size]['loss'].split(';'),
                                         dtype=float)[0:10000]
                size_for_loss_ga[size].append(
                    u.YSeries(ga_y_vals,
                              xvalues=np.arange(ga_y_vals.size) + 1,
                              line_color='b',
                              points_marker='.',
                              legend_marker='o',
                              plot_legend_label='tournament selection'))
                # x values are sized from the series being plotted so x and y
                # always have the same length, even when the two GA runs
                # logged a different number of iterations.
                size_for_loss_ga[size].append(
                    u.YSeries(bad_ga_y_vals,
                              xvalues=np.arange(bad_ga_y_vals.size) + 1,
                              line_color='r',
                              points_marker='.',
                              legend_marker='o',
                              plot_legend_label='roulette wheel'))
        y_ser.append(
            u.YSeries(train_ser,
                      xvalues=x,
                      line_color=color,
                      points_marker='x',
                      plot_legend_label=label + "-train"))
        y_ser.append(
            u.YSeries(valid_ser,
                      xvalues=x,
                      line_color=color,
                      points_marker='o',
                      plot_legend_label=label + "-valid"))
        time_y_ser.append(
            u.YSeries(times,
                      xvalues=x,
                      line_color=color,
                      points_marker=marker,
                      plot_legend_label=label))

    x_axis_name = 'trainset size %'
    y_axis_name = 'f-measure'
    plot_file = u.PreparePath(rootfolder + "/plot10k/learning_curves.png")
    time_plot_file = u.PreparePath(rootfolder + "/plot10k/time.png")
    u.SaveDataPlotWithLegends(y_ser,
                              filename=plot_file,
                              x_axis_name=x_axis_name,
                              y1_axis_name=y_axis_name)
    u.SaveDataPlotWithLegends(time_y_ser,
                              filename=time_plot_file,
                              x_axis_name=x_axis_name,
                              y1_axis_name="Time (MilliSec)")
    for key, series in size_for_loss_curves.items():
        loss_plot_file = u.PreparePath(
            rootfolder + "/plot10k/loss_curves_{0}.png".format(str(key)))
        u.SaveDataPlotWithLegends(series,
                                  filename=loss_plot_file,
                                  title="Size % : " + str(key),
                                  x_axis_name='iters',
                                  y1_axis_name="Loss")
    for key, series in size_for_loss_ga.items():
        loss_plot_file = u.PreparePath(
            rootfolder + "/plot10k/loss_curves_ga_{0}.png".format(str(key)))
        u.SaveDataPlotWithLegends(series,
                                  filename=loss_plot_file,
                                  title="Size % : " + str(key),
                                  x_axis_name='iters',
                                  y1_axis_name="Loss")