Python FilterRows 예제들, utils.FilterRows Python 예제들

예제 #1

0

파일 보기

파일: RandomisedAlgorithmsAnalysis.py 프로젝트: shwetabh-khanduja/ml

def ReadNNetResultsFile10k(rootpath):
    """Load the neural-net results and the GA crowding results.

    Reads two CSV files located directly under ``rootpath``:
    ``nnet.output.csv`` and
    ``output_test_lr_10k-iters_30-hiddenlayers_crowding.csv``.

    Args:
        rootpath: directory containing both CSV files.

    Returns:
        A ``(data, ga_noprogress)`` tuple of DataFrames, in that order:
        the contents of ``nnet.output.csv`` and of the crowding file.
    """
    # NOTE: the previous version carried an alternative implementation after
    # the return statement (splitting 'ga' rows out of `data` and merging in
    # a separate GA file).  It could never execute and has been removed.
    data = pd.read_csv(rootpath + "/nnet.output.csv")
    ga_noprogress = pd.read_csv(
        rootpath + "/output_test_lr_10k-iters_30-hiddenlayers_crowding.csv")
    return data, ga_noprogress

예제 #2

0

파일 보기

def PlotAdaboostPerIterationCurves(file_template,
                                   filter,
                                   plot_output_file,
                                   iters,
                                   y_axis_name='F-Measure'):
    """Plot train and validation curves for a fixed list of size values.

    For each value in 20, 30, ..., 100 the CSV at
    ``file_template.format(str(value))`` is read, reduced with ``filter``,
    indexed by its ``iter`` column and split on ``istrain`` (1 = train,
    0 = the series labelled "validation").  The ``m`` column is sampled at
    every entry of ``iters`` and the resulting series are saved as a single
    plot.

    Args:
        file_template: format string with one placeholder for the size value.
        filter: row predicate handed to ``u.FilterRows``.
        plot_output_file: destination passed to ``u.SaveDataPlotWithLegends``.
        iters: index values (of the ``iter`` column) to sample; also used as
            the x values of the plot.
        y_axis_name: label of the y axis.
    """
    size_values = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    palette = u.GetColorCombinations()
    all_series = []
    for size_value in size_values:
        label = str(size_value)
        frame = pd.read_csv(file_template.format(label))
        frame = u.FilterRows(frame, filter).set_index('iter')
        train_rows = FilterRows(frame, lambda x: x['istrain'] == 1)
        test_rows = FilterRows(frame, lambda x: x['istrain'] == 0)
        train_curve = []
        test_curve = []
        for step in iters:
            train_curve.append(train_rows.loc[step]['m'])
            test_curve.append(test_rows.loc[step]['m'])
        # Train and validation curves of one size share a colour and differ
        # only by marker.
        style = palette.pop()
        all_series.append(
            u.YSeries(train_curve,
                      points_marker='o',
                      line_color=style['color'],
                      plot_legend_label=label + "-train"))
        all_series.append(
            u.YSeries(test_curve,
                      points_marker='x',
                      line_color=style['color'],
                      plot_legend_label=label + "-validation"))
    u.SaveDataPlotWithLegends(all_series,
                              iters,
                              plot_output_file,
                              x_axis_name="num of iters/weak learners",
                              y1_axis_name=y_axis_name)

예제 #3

0

파일 보기

파일: UnsupervisedLearningMain.py 프로젝트: shwetabh-khanduja/ml

def ComputeFinalResults(rootfolder):
    """Write the best (lowest-BIC) clustering per method to ``best_metrics.csv``.

    Reads ``<rootfolder>/clustering.csv`` and keeps only rows with ``p == 2``.
    For every clustering method ("kmeans", "gmm") the lowest-BIC row on the
    "raw" data is located and its cluster assignments are loaded from
    ``<rootfolder>/clustering_output``.  Then, for each dimensionality
    reduction method ("ica", "pca", "rp", "mi") and clustering method, the
    lowest-BIC row is located, its assignments are loaded, and the adjusted
    mutual information against the matching raw assignments is computed.
    One CSV line per (dim-red, clustering) pair is written to
    ``<rootfolder>/best_metrics.csv``.

    Args:
        rootfolder: directory holding ``clustering.csv`` and the
            ``clustering_output`` folder.

    Raises:
        IndexError: if no row exists for one of the requested combinations.
    """
    clustering_stats = pd.read_csv(rootfolder+"/clustering.csv")
    final_output_file = rootfolder+"/best_metrics.csv"
    data = u.FilterRows(clustering_stats, lambda x : x['p'] == 2)
    dim_red = ["ica","pca","rp","mi"]
    clustering = ["kmeans","gmm"]
    lines = ["clustering,dim_red,k,p,ami_raw,ami_true,sc,bic"]

    def best_row(dim_red_method, clustering_method):
        # First row having the minimum BIC for this combination.
        subset = data.loc[(data['dim_red_method'] == dim_red_method) & (data['clustering'] == clustering_method),:]
        subset = subset.loc[subset['bic'] == np.min(subset['bic']),:]
        return subset.iloc[0]

    raw_predictions = {}
    for c in clustering:
        best = best_row("raw", c)
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c,best['k'])
        raw_predictions[c] = np.loadtxt(clusters_file,delimiter=',')

    for dr in dim_red:
        for c in clustering:
            best = best_row(dr, c)
            clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo={2}.csv".format(c,best['k'],dr)
            predicted = np.loadtxt(clusters_file,delimiter=',')
            ami = metrics.adjusted_mutual_info_score(raw_predictions[c],predicted)
            # The header declares a `p` column; it was previously omitted
            # from the row, shifting every later value one column left.
            lines.append(u.ConcatToStr(",",[c,dr,best['k'],best['p'],ami,best['ami_true'],best['sc'],best['bic']]))

    u.WriteTextArrayToFile(final_output_file,lines)

예제 #4

0

파일 보기

def PlotAvgRewardsPerEpisode(data,
                             totalpoints,
                             points_to_sample,
                             outputfile,
                             y_axis_name,
                             key_to_plot,
                             max_points=100000):
    """Plot a sub-sampled per-episode series for the selected Q-learning runs.

    Rows are kept when ``solver == 'q'``, ``gamma == 0.99``, ``alpha == 1``
    and ``maxInnerVi == totalpoints``.  Each kept row is converted to a plot
    series by ``GetQRewardSeriesToPlot`` using every ``points_to_sample``-th
    episode index below ``max_points``.

    Key abbreviations carried over from the original notes (presumably the
    accepted values of ``key_to_plot`` -- confirm against
    ``GetQRewardSeriesToPlot``):
        cr   : cum_rewards
        ar   : avg_rewards
        len  : episode_len
        goal : reached_goal

    Args:
        data: DataFrame of run results.
        totalpoints: number of episodes; also the ``maxInnerVi`` value to match.
        points_to_sample: sampling stride over the episode indices.
        outputfile: path of the plot to write.
        y_axis_name: label of the y axis.
        key_to_plot: forwarded to ``GetQRewardSeriesToPlot``.
        max_points: exclusive upper bound on the sampled episode indices.
    """

    def is_selected_run(x):
        return ((x['solver'] == 'q') & (x['gamma'] == 0.99) &
                (x['alpha'] == 1) & (x['maxInnerVi'] == totalpoints))

    selected_runs = u.FilterRows(data, is_selected_run)

    episode_idx = np.arange(totalpoints)
    on_stride = episode_idx % points_to_sample == 0
    below_cap = episode_idx < max_points
    episode_idx = episode_idx[on_stride & below_cap]

    series = selected_runs.apply(
        lambda row: GetQRewardSeriesToPlot(row, episode_idx, key_to_plot),
        axis=1)
    u.SaveDataPlotWithLegends(series.values,
                              filename=outputfile,
                              x_axis_name="episodes",
                              y1_axis_name=y_axis_name)

예제 #5

0

파일 보기

파일: UnsupervisedLearningMain.py 프로젝트: shwetabh-khanduja/ml

def PlotClusteringMetrics(rootfolder, data, k,dim='raw',p=2):
    """Plot each clustering metric against k for kmeans and gmm.

    Keeps the rows of ``data`` whose ``dim_red_method`` equals ``dim`` and
    whose ``p`` equals ``p``, then writes one plot per metric ("ami_raw",
    "ami_true", "sc", "bic") under ``<rootfolder>/plots/metrics`` with a
    kmeans curve and a gmm curve.

    Args:
        rootfolder: base output directory.
        data: DataFrame of clustering statistics.
        k: not used by this function.
        dim: value of ``dim_red_method`` to select.
        p: value of ``p`` to select.
    """

    def row_matches(x):
        return x['dim_red_method'] == dim and x['p'] == p

    selected = u.FilterRows(data, row_matches)
    metric_names = ["ami_raw","ami_true","sc","bic"]
    # clustering method -> (marker, line colour, legend label)
    styles = {"kmeans":('o','b','kmeans'),"gmm":('x','r','gmm')}
    rows_by_method = {
        name: selected.loc[selected['clustering'] == name,:]
        for name in ("gmm", "kmeans")
    }
    for metric in metric_names:
        outputfile = u.PreparePath(rootfolder + "/plots/metrics/{0}_{1}_p={2}.png".format(metric,dim,str(p)))
        curves = []
        for name in ("kmeans", "gmm"):
            marker, color, label = styles[name]
            rows = rows_by_method[name]
            curves.append(u.YSeries(rows[metric],xvalues=rows["k"],points_marker=marker,line_color=color,plot_legend_label=label))
        u.SaveDataPlotWithLegends(curves,x_axis_name="number of clusters",y1_axis_name=metric,filename=outputfile)

예제 #6

0

파일 보기

파일: UnsupervisedLearningMain.py 프로젝트: shwetabh-khanduja/ml

def GetBestRawClustering(rootfolder):
    """Return the lowest-BIC raw-data clustering for kmeans and gmm.

    Reads ``<rootfolder>/clustering.csv``, keeps rows with ``p == 2`` and,
    for each clustering method, selects the rows with ``dim_red_method ==
    "raw"`` whose ``bic`` equals the minimum.  The cluster assignments of
    the first such row are loaded from ``<rootfolder>/clustering_output``.

    Args:
        rootfolder: directory holding ``clustering.csv`` and the
            ``clustering_output`` folder.

    Returns:
        dict mapping "kmeans" / "gmm" to a tuple
        ``(assignments, rows)``: the array loaded with ``np.loadtxt`` and a
        copy of the minimum-BIC rows.

    Raises:
        IndexError: if a clustering method has no raw rows with ``p == 2``.
    """
    clustering_stats = pd.read_csv(rootfolder+"/clustering.csv")
    data = u.FilterRows(clustering_stats, lambda x : x['p'] == 2)
    raw_predictions = {}
    for c in ["kmeans","gmm"]:
        d = data.loc[(data['dim_red_method'] == "raw") & (data['clustering'] == c),:]
        d = d.loc[d['bic'] == np.min(d['bic']),:]
        clusters_file = rootfolder + "/clustering_output/mthd={0}_k={1}_d=2_algo=raw.csv".format(c,d.iloc[0]['k'])
        raw_predictions[c] = (np.loadtxt(clusters_file,delimiter=','),d.copy())
    return raw_predictions

예제 #7

0

파일 보기

파일: UnsupervisedLearningMain.py 프로젝트: shwetabh-khanduja/ml

def PlotClusteringMetricsForDimsAndDimRed(rootfolder, data, dims, dim_reds,k):
    """Plot each clustering metric against dimension count, one plot per k.

    For every value in ``k`` and every metric ("ami_raw", "ami_true", "sc",
    "bic") a plot is written to
    ``<rootfolder>/plots/metrics/dr_<metric>_k=<k>.png``.  Each plot holds,
    for every entry of ``dim_reds``, a kmeans curve followed by a gmm curve
    of the metric against the ``p`` column.

    Args:
        rootfolder: base output directory.
        data: DataFrame of clustering statistics.
        dims: not used by this function.
        dim_reds: dimensionality reduction method names; each must be a key
            of the colour table below.
        k: iterable of cluster counts to plot.

    Raises:
        KeyError: if an entry of ``dim_reds`` has no colour assigned.
    """
    colors = {"ica":'r','pca':'b','rp':'g','mi':'k','raw':'orange'}
    markers = {"kmeans":'o',"gmm":'x'}
    # Named metric_names so the local list does not hide a module called `metrics`.
    metric_names = ["ami_raw","ami_true","sc","bic"]
    for _k in k:
        # The per-k u.FilterRows() result that used to be computed here was
        # never read; the masks below already restrict rows to this k.
        for metric in metric_names:
            ser = []
            outputfile = u.PreparePath(rootfolder + "/plots/metrics/dr_{0}_k={1}.png".format(metric,str(_k)))
            for dim_red in dim_reds:
                for method in ("kmeans", "gmm"):
                    d = data.loc[(data['dim_red_method'] == dim_red) & (data['k'] == _k) & (data['clustering'] == method) ,:]
                    ser.append(u.YSeries(d[metric],xvalues=d['p'],line_color=colors[dim_red],points_marker=markers[method],plot_legend_label="{0}-{1}".format(dim_red,method)))
            u.SaveDataPlotWithLegends(ser,x_axis_name="dimensions",y1_axis_name=metric,filename = outputfile)

예제 #8

0

파일 보기

파일: UnsupervisedLearningMain.py 프로젝트: shwetabh-khanduja/ml

def ComputeFinalResults1(rootfolder,clusters,dims):
    """Pick the best dimension per dim-red method and write ``best_metrics.csv``.

    Steps:
      1. Save the best raw clusterings (from ``GetBestRawClustering``) to
         ``<rootfolder>/best_raw_clustering.csv``.
      2. For each dimensionality reduction method ("mi", "pca", "ica", "rp")
         and each candidate dimension in ``dims``, find per clustering
         method the row with the lowest ``bic`` over ``clusters``.
      3. Score the dimension by the mean ``ami_true`` of the kmeans and gmm
         winners, and keep the dimension with the highest score (the first
         one wins ties).
      4. Emit one CSV line per clustering method for the winning dimension,
         including the AMI against the raw predictions obtained from
         ``GetAmiWithRawPredictions``.

    Args:
        rootfolder: directory holding ``clustering.csv``.
        clusters: iterable of cluster counts (``k`` values) to consider.
        dims: iterable of dimension counts (``p`` values) to consider.

    Raises:
        IndexError: if some (method, k, p) combination has no row.
    """
    raw_results = GetBestRawClustering(rootfolder)
    clustering_results = pd.read_csv(rootfolder+"/clustering.csv")
    final_output_file = rootfolder+"/best_metrics.csv"
    best_raw_clustering_output = rootfolder+"/best_raw_clustering.csv"
    o = pd.concat([raw_results["kmeans"][1], raw_results["gmm"][1]])
    o.to_csv(best_raw_clustering_output)
    dim_red = ["mi","pca","ica","rp"]
    lines = ["clustering,dim_red,k,p,ami_raw,ami_true,sc,bic"]

    for dr in dim_red:
        data = u.FilterRows(clustering_results,lambda x : x["dim_red_method"] == dr)
        dim_best_val = None
        dim_result = None
        for dim in dims:
            best = {}  # clustering method -> single-row frame with the lowest bic
            for cluster_mthd in ["kmeans","gmm"]:
                for cluster in clusters:
                    print("{0},{1},{2},{3}".format(dr,str(dim),cluster_mthd,str(cluster)))
                    d = data.loc[(data['clustering'] == cluster_mthd)&(data['k'] == cluster) & (data['p'] == dim)]
                    row = d.head(1).copy()
                    if cluster_mthd not in best or best[cluster_mthd]['bic'].iloc[0] > row['bic'].iloc[0]:
                        best[cluster_mthd] = row
            # Score of this dimension: mean true-label AMI of both methods.
            curr_val = (best["kmeans"]['ami_true'].iloc[0] + best["gmm"]['ami_true'].iloc[0]) / 2
            if dim_best_val is None or dim_best_val < curr_val:
                dim_best_val = curr_val
                dim_result = best.copy()
        for c in ["kmeans","gmm"]:
            chosen = dim_result[c].iloc[0]
            ami_raw = GetAmiWithRawPredictions(rootfolder,raw_results,dr,chosen["p"],chosen["k"],c)
            lines.append("{0},{1},{2},{3},{4},{5},{6},{7}".format(c,str(chosen["dim_red_method"]),str(chosen["k"]),str(chosen["p"]),str(ami_raw[c]),str(chosen["ami_true"]),str(chosen["sc"]),str(chosen["bic"])))

    u.WriteTextArrayToFile(final_output_file,lines)

예제 #9

0

파일 보기

파일: RandomisedAlgorithmsAnalysis.py 프로젝트: shwetabh-khanduja/ml

def PlotPerIterationCurves(rootFolder, outputfolder):
    """Plot function value against iteration for four optimisers, per size.

    Reads ``mimic.csv``, ``sa.csv``, ``rhc.csv`` and ``ga.csv`` from
    ``rootFolder``.  For every distinct ``size`` found in the mimic file a
    plot ``itercurves_<size>.png`` is written to ``outputfolder`` containing
    one curve per algorithm, built from at most the first 10000 rows of that
    size.

    Args:
        rootFolder: directory containing the four input CSV files.
        outputfolder: directory receiving the plots.
    """
    mimic = pd.read_csv(rootFolder + "/mimic.csv")
    sa = pd.read_csv(rootFolder + "/sa.csv")
    rhc = pd.read_csv(rootFolder + "/rhc.csv")
    ga = pd.read_csv(rootFolder + "/ga.csv")

    # algorithm -> (line colour, marker, legend label, data frame);
    # only the colour, the label and the frame are read below.
    styles = {
        'mimic': ('r', 'o', 'mimic', mimic),
        'ga': ('g', 's', 'genetic algo', ga),
        'sa': ('b', '+', 'sim annealing', sa),
        'rhc': ('k', '*', 'rhc', rhc)
    }

    def build_series(rows, algo):
        iterations = rows['iters']
        values = rows['fn_value']
        color, _, label, _ = styles[algo]
        return u.YSeries(values,
                         xvalues=iterations,
                         points_marker='.',
                         plot_legend_label=label,
                         legend_marker='o',
                         line_color=color)

    for size in np.array(mimic['size'].unique()):
        plot_path = u.PreparePath(outputfolder + "/itercurves_" + str(size) +
                                  ".png")
        curves = []
        for algo, entry in styles.items():
            rows = u.FilterRows(entry[3],
                                lambda x: x['size'] == size).head(10000)
            curves.append(build_series(rows, algo))
        u.SaveDataPlotWithLegends(curves,
                                  x_axis_name="iters",
                                  x=None,
                                  y1_axis_name="fn value",
                                  filename=plot_path)

예제 #10

0

파일 보기

파일: RandomisedAlgorithmsAnalysis.py 프로젝트: shwetabh-khanduja/ml

def PlotTempVariationCurvesForSa(rootfolder, algoname, temperatures):
    """Plot iterations-to-converge against size for rhc and several sa runs.

    The rhc curve comes from the ``algo == 'rhc'`` rows of
    ``<rootfolder>/<algoname>/stats_agg.csv``.  For every entry ``t`` of
    ``temperatures``, ``CompteStats`` is run on
    ``<rootfolder>/<algoname>_<t>`` and the ``stats_agg.csv`` found there is
    read and added as curve ``sa_<t>``.  The plot is saved to
    ``<rootfolder>/<algoname>/plots/sa_temperatures.png``.

    Args:
        rootfolder: base directory of the experiment outputs.
        algoname: name of the sub-folder / folder prefix.
        temperatures: iterable of strings; each must be one of the keys of
            the style table below ('0', '90', '95', '99').

    Raises:
        KeyError: if a temperature has no style entry.
    """
    algo_root = rootfolder + "/" + algoname
    rhc_rows = u.FilterRows(
        pd.read_csv(algo_root + "/stats_agg.csv"),
        lambda x: x['algo'] == 'rhc')
    curves = [
        u.YSeries(rhc_rows['converged_iters'],
                  xvalues=rhc_rows['size'],
                  points_marker="*",
                  line_color="k",
                  plot_legend_label="rhc")
    ]
    # temperature -> (line colour, marker)
    styles = {
        '0': ("r", "x"),
        '90': ("b", "o"),
        '95': ("g", "+"),
        '99': ("orange", ">")
    }
    for temperature in temperatures:
        run_dir = algo_root + "_" + temperature
        CompteStats(run_dir, ["sa.csv"])
        stats = pd.read_csv(run_dir + "/stats_agg.csv")
        converged = stats['converged_iters']
        sizes = stats['size']
        marker = styles[temperature][1]
        color = styles[temperature][0]
        curves.append(
            u.YSeries(converged,
                      xvalues=sizes,
                      points_marker=marker,
                      line_color=color,
                      plot_legend_label="sa_" + temperature))
    outputfile = algo_root + "/plots/sa_temperatures.png"
    u.SaveDataPlotWithLegends(curves,
                              y1_axis_name="iterations to converge",
                              x_axis_name="size",
                              filename=outputfile)

예제 #11

0

파일 보기

파일: DecisionTreesWithCV.py 프로젝트: shwetabh-khanduja/ml

def RunDecisionTreesWithOptimalInst(datasets_root_folder,
                                    weka_jar_path,
                                    cv_results_file,
                                    use_arff_files=True):
    """Train and evaluate Weka decision trees using the best CV ``inst`` value.

    For every sub-directory of ``datasets_root_folder`` and for both values
    of ``prune``, the cross-validation results are searched for the rows
    matching the dataset's filter option, and the ``inst`` value of the
    first row whose ``m`` is (numerically) the maximum is used.  A model is
    then built on the dataset's train file and evaluated on the first test
    file found directly under ``datasets_root_folder``; wall-clock times and
    all config values are appended to the run's ``.params.txt`` file.

    Args:
        datasets_root_folder: folder holding the test file, the CV results
            file and one sub-directory per dataset.
        weka_jar_path: path to the Weka jar, stored in the run config.
        cv_results_file: file name (relative to ``datasets_root_folder``) of
            the cross-validation results CSV.
        use_arff_files: use ``*.arff`` inputs when true, ``*.csv`` otherwise.

    Raises:
        IndexError: if a train/params file (or, once a run starts, a test
            file) cannot be found by the glob patterns.
    """
    # No leading dot here: the glob patterns below already contain the "."
    # separator, so ".csv" used to produce "*.test..csv" and match nothing.
    file_extn = "arff" if use_arff_files else "csv"
    testfiles = glob.glob("{0}/*.test.{1}".format(datasets_root_folder,
                                                  file_extn))
    cv_results = pd.read_csv(datasets_root_folder + "/" + cv_results_file)
    for dataset_dir in u.Get_Subdirectories(datasets_root_folder):
        trainfile = glob.glob("{0}/*.train.{1}".format(dataset_dir,
                                                       file_extn))[0]
        paramfile = glob.glob("{0}/*.params.txt".format(dataset_dir))[0]
        dt_root = u.PreparePath(dataset_dir + "/dt", is_file=False)
        filter_name, filter_val = GetFilterOptions(dataset_dir)
        config_gen = ParameterGrid({'prune': [True, False]})
        for config in config_gen:

            # NOTE(review): the selection always uses the prune == False CV
            # rows, regardless of config['prune'] -- confirm this is intended.
            row_filter = lambda x: (x['prune'] == False) & (x[
                filter_name] == filter_val) & (x[
                    'istrain'] == 1)  # this will output on the held out set
            filtered_rows = u.FilterRows(cv_results, row_filter)
            a = filtered_rows['m']
            if len(a) == 0:
                print("ignoring : {0}".format(dataset_dir))
                continue
            b = np.max(filtered_rows['m'])
            # isclose() rather than == so float noise cannot hide the maximum.
            indxs = np.isclose(a, b)
            best_insts = filtered_rows[indxs]
            best_insts = best_insts.iloc[0]['inst']
            config['inst'] = best_insts

            config_id = GetIdForOptConfig(config)
            params_info = u.ReadLinesFromFile(paramfile)
            params_info_dict = sl.GetDictionary(params_info)
            run_output_dir = u.PreparePath("{0}/{1}".format(dt_root,
                                                            config_id),
                                           is_file=False)
            params_output_file = u.PreparePath("{0}/{1}.params.txt".format(
                run_output_dir, config_id))
            model_output_file = u.PreparePath("{0}/{1}.model".format(
                run_output_dir, config_id))
            train_output_file = u.PreparePath(
                "{0}/{1}.train.predictions.csv".format(run_output_dir,
                                                       config_id))
            test_output_file = u.PreparePath(
                "{0}/{1}.test.predictions.csv".format(run_output_dir,
                                                      config_id))
            config['random_state'] = params_info_dict['random_state']
            config["wekajar"] = weka_jar_path
            config["trainset"] = trainfile
            config["class"] = "last"
            config["trainpredictionoutputfile"] = train_output_file
            config["predictionoutputfile"] = config[
                "trainpredictionoutputfile"]
            config["modeloutputfile"] = model_output_file
            config["testpredictionoutputfile"] = test_output_file

            # for every config there has to be a train prediction and test prediction
            cmd = GetWekaCommandLineForConfig(config, False, False)
            config["modelbuildtimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            # now for test set
            config["predictionoutputfile"] = test_output_file
            config["testset"] = testfiles[0]
            cmd = GetWekaCommandLineForConfig(config, True, False)
            config["modelevaltimesecs"] = timeit.timeit(
                lambda: sl.RunCmdWithoutConsoleWindow(cmd), number=1)

            config.pop('random_state',
                       None)  # since we already have that in params_info
            for k in config:
                params_info.append("{0}={1}".format(k, config[k]))
            u.WriteTextArrayToFile(params_output_file, params_info)
        print("done dataset : " + dataset_dir)
예제 #12

0

파일 보기

def _parse_semicolon_floats(cell):
    """Convert a ';'-separated string of numbers into a float ndarray."""
    return np.array([float(s) for s in cell.split(';')])


def _plot_pi_vi_convergence_for_mdp(data, outputfolder, gamma, mdp_name,
                                    file_prefix):
    """Write the qchange and value convergence plots for one MDP.

    Each plot contains the single VI run followed by one PI run per sweep
    setting (1, 10, 10000).  Output files are
    ``<outputfolder>/plots/<file_prefix>_qchange_gamma=<gamma>.png`` and
    ``<outputfolder>/plots/<file_prefix>_value_gamma=<gamma>.png``.
    """
    decorations = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]

    def as_series(values, color, label):
        # x axis is the 1-based iteration number.
        return u.YSeries(values,
                         xvalues=np.arange(len(values)) + 1,
                         line_color=color,
                         plot_legend_label=label)

    # The VI row does not depend on the sweep setting, so look it up once.
    data_vi = u.FilterRows(
        data, lambda x: (x['mdp'] == mdp_name) &
        (x['solver'] == 'vi') & (x['gamma'] == gamma))
    assert (len(data_vi) == 1)
    vi_row = data_vi.iloc[0]
    ser = [as_series(_parse_semicolon_floats(vi_row['cum_rewards']),
                     'b', 'VI')]
    ser1 = [as_series(_parse_semicolon_floats(vi_row['ran_to_completion']),
                      'b', 'VI')]

    for sweep in pi_sweeps:
        data_pi = u.FilterRows(
            data, lambda x:
            (x['mdp'] == mdp_name) & (x['solver'] == 'pi') &
            (x['gamma'] == gamma) & (x['maxSweepsPerIteration'] == sweep))
        assert (len(data_pi) == 1)
        pi_row = data_pi.iloc[0]
        label = 'PI_' + str(sweep)
        ser.append(as_series(_parse_semicolon_floats(pi_row['cum_rewards']),
                             decorations[sweep], label))
        ser1.append(
            as_series(_parse_semicolon_floats(pi_row['ran_to_completion']),
                      decorations[sweep], label))

    outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                               "_qchange_gamma=" + str(gamma) + ".png")
    u.SaveDataPlotWithLegends(ser,
                              filename=outputfile,
                              x_axis_name="iterations",
                              y1_axis_name="Max change in state value")
    outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                               "_value_gamma=" + str(gamma) + ".png")
    u.SaveDataPlotWithLegends(ser1,
                              filename=outputfile,
                              x_axis_name="iterations",
                              y1_axis_name="Total value accross states")


def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    """Plot VI vs PI convergence curves for the large and the small MDP.

    Reads ``datafile`` and, for the MDPs ``LargeMdpRwTraps50`` and
    ``SmallMdpRwTraps`` (in that order), writes two plots each under
    ``<outputfolder>/plots``: one built from the ``cum_rewards`` column
    (y axis "Max change in state value") and one from the
    ``ran_to_completion`` column (y axis "Total value accross states").
    Both columns are expected to hold ';'-separated numbers.

    Args:
        outputfolder: base directory for the plots.
        datafile: CSV file with one row per (mdp, solver, gamma, sweeps) run.
        gamma: discount value used to select the rows.

    Raises:
        AssertionError: if a required (mdp, solver, gamma[, sweeps])
            combination does not match exactly one row.
    """
    data = pd.read_csv(datafile)
    _plot_pi_vi_convergence_for_mdp(data, outputfolder, gamma,
                                    'LargeMdpRwTraps50', 'large')
    _plot_pi_vi_convergence_for_mdp(data, outputfolder, gamma,
                                    'SmallMdpRwTraps', 'small')