Exemplo n.º 1
0
def run(args, features, rank_matrix, plot_dict, outlier_ids):
    """Select plots with each algorithm, save them as PNGs and log the results.

    A graph is built once from ``rank_matrix`` and ``outlier_ids``; every
    algorithm then works on its own deep copy of it. For each algorithm the
    chosen plots are rendered into ``args.plotfolder`` and one summary line
    (outlier count, budget, algorithm, elapsed time, coverage) is written to
    the log file at ``args.logfolder + args.logfile``.

    The folder values are concatenated with the file names as-is, so they
    must already end with a path separator.

    Args:
        args: Options object; reads ``num_outliers``, ``budget``, ``p_val``,
            ``baseline``, ``logfolder``, ``logfile`` and ``plotfolder``.
        features: Indexed by the two entries of each ``plot_dict`` value; the
            results are passed to ``scatter_outliers`` as its first two
            arguments.
        rank_matrix: Passed through to ``generate_graph``.
        plot_dict: Maps a selected plot to its pair of feature keys; its
            length is reported as the total number of plots generated.
        outlier_ids: Passed through to ``generate_graph``.
    """
    # Quick Access Variables
    N_val = args.num_outliers
    B_val = args.budget
    P_val = float(args.p_val)

    # Create graph between outliers and plots
    cprint("Generating Bipartite Graph")
    scaled_matrix, normal_matrix = generate_graph(P_val, rank_matrix,
                                                  outlier_ids)
    saved_graph = Graph(scaled_matrix)
    print_ok("Graph Generated Successfully")

    # Run appropriate algorithm to get list of selected graphs
    scatter_plots = len(plot_dict)
    # The baselines are only run next to LookOut when explicitly requested.
    algos = ["LookOut", "TopK", "Random"] if args.baseline else ["LookOut"]

    # The context manager guarantees the log is flushed and closed even when
    # an algorithm or a plot save raises part-way through the loop.
    with open(args.logfolder + args.logfile, 'w') as log_file:
        for algo in algos:
            cprint("\nIteration " + algo, RED)
            # Each algorithm gets a private copy so that any changes it makes
            # to the graph cannot leak into the next run.
            graph = copy.deepcopy(saved_graph)
            print("N_val = ", N_val, " Budget = ", B_val)

            # The timed region covers plot selection and frequency counting.
            start_time = time.time()
            cprint("Running " + algo + " Algorithm")
            plots = LookOut(graph, B_val, algo)
            frequencies = generate_frequency_list(plots, scaled_matrix)
            print_ok(algo + " Complete")
            elapsed_time = time.time() - start_time

            cprint("Saving Plots")
            coverage, max_coverage = get_coverage(plots, N_val, normal_matrix)
            print("\t-> Total Plots Generated = ", end='')
            cprint(scatter_plots, OKBLUE)
            print("\t-> Total Plots Chosen = ", end='')
            cprint(len(plots), OKBLUE)
            print("\t-> Coverage = ", end='')
            cprint("{0:.3f} / {1:.3f}".format(coverage, max_coverage), OKBLUE)

            # Save selected plots as png images
            for i, plot in enumerate(plots):
                pair = plot_dict[plot]
                fig = scatter_outliers(features[pair[0]], features[pair[1]],
                                       frequencies, plot)
                fname = args.plotfolder + '{0}-{1}-{2}-{3}.png'.format(
                    algo, N_val, B_val, i)
                fig.savefig(fname)
                # Release the figure right away so figures do not pile up in
                # memory across plots.
                plt.close(fig)
            print_ok("Plots Saved")

            log_file.write(
                "N_val " + str(N_val) + "\tBudget " + str(B_val) +
                "\tAlgo " + algo + "\tTime Taken = " + str(elapsed_time) +
                "\tCoverage = " + str(coverage) + "%" + "\n")
    cprint("Finished")
Exemplo n.º 2
0
# Get outlier scores if using iForests (only when generate_iForest is set).
if generate_iForest:
    cprint("Generating Graph File")
    # NOTE(review): eval() is applied to every entry of the three feature
    # lists -- presumably names of variables already in scope. Only safe while
    # those lists are hard-coded and never come from untrusted input.
    features = combine_features([
        eval(F)
        for F in identity_features + continuous_features + discrete_features
    ])
    iForest(features)
    print_ok("iForest Generation Complete")

# Log file for the whole sweep.
# NOTE(review): no matching close() is visible in this excerpt -- confirm it
# is closed further down.
file = open(filefolder + "Log.txt", 'w')
# Outlier counts to sweep over.
N_list = [10, 20, 50, 75, 100]
for N_val in N_list:
    # Create graph between outliers and plots
    cprint("Generating Graph File")
    ranklist.generate_graph(P_val, N_val)
    print_ok("Graph File Generated")
    # Run plotSpot to get selected graphs, once per budget and algorithm.
    Budget = [1, 2, 3, 4, 5, 6]
    for B in Budget:
        for algo in ["SpellOut", "Greedy", "G_Norm"]:
            # Python 2 print statement.
            print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo
            # Only the plotSpot call itself is timed.
            start_time = time.time()
            cprint("Running PlotSpot Algorithm")
            plots = plotSpot(B, algo)
            print_ok("PlotSpot Complete")
            elapsed_time = time.time() - start_time
            coverage = get_coverage(plots)
            # Save selected plots in pdf
            cprint("Saving Plots")
            # (the excerpt ends here; the rest of the loop body is not shown)
            total_plots = scatter_plots
Exemplo n.º 3
0
# Build one outlier rank list per feature pair, then pick and save the plots.
# NOTE(review): `pp` is opened and closed below, but nothing in this excerpt
# writes a figure into it -- confirm whether a pp.savefig(fig) call is missing.
pp = PdfPages(plotfolder + 'scatterplots--full.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    # Progress indicator (Python 2 print statement).
    print j, 'of', len(feature_pairs)
    # Stack the two feature columns and transpose, giving one row per entry
    # of the INFO columns and one column per feature.
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=500, max_samples=1000,
        random_state=0, contamination=num_outlier / 343546.0 # number of nodes
    )
    # NOTE(review): `fig` is not used again inside this loop in the visible
    # code (neither saved nor closed).
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y],
                                  discription[X],
                                  discription[Y] + ' vs ' + discription[X],
                                  compare_value[X])
    forest.fit(pair_features)
    # Score only the rows selected by outlier_ids.
    scores = forest.decision_function(pair_features[outlier_ids, :])
    # Pair the i-th score with outliers[i]; the score is negated and the list
    # sorted in descending order, so the lowest decision_function value comes
    # first (presumably the most anomalous point, if this is scikit-learn's
    # IsolationForest -- TODO confirm).
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)],
                       key=lambda x: x[1], reverse=True)
    rank_matrix.append(rank_list)
pp.close()


# NOTE: runs properly up to this point. Open question from the original
# author: why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier, rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
# Save every selected plot as a PNG named after the outlier count, the budget
# and the plot's position in the selection.
for i, plot in enumerate(plots):
    fig = scatter_outliers(plot, INFO['IDs'], frequencies)
    fname = 'discoveries/DBLP--full-{0}-{1}-{2}.png'.format(num_outlier,
                                                          budget, i)
    fig.savefig(fname)
Exemplo n.º 4
0
	cprint("Generating Graph File")
	# NOTE(review): eval() is applied to every entry of the three feature
	# lists -- presumably names of variables already in scope. Only safe while
	# those lists are hard-coded and never come from untrusted input.
	features = combine_features([eval(F) for F in identity_features + continuous_features + discrete_features])
	iForest(features)
	print_ok("iForest Generation Complete")

# Log file for the sweep.
# NOTE(review): no matching close() is visible in this excerpt -- confirm it
# is closed further down.
file = open(filefolder + logfile, 'w')

# Use outlier list if provided: when neither iForest generation nor rank-list
# merging is enabled, sweep a single N equal to the size of
# global_outlier_list.
if not generate_iForest and not merge_ranklists:
	N_list = [len(global_outlier_list)]

# Running iteration counter across all (N, budget, algorithm) combinations.
count = 0
for N_val in N_list:
	# Create graph between outliers and plots
	cprint("Generating Graph File")
	scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, N_val, rank_matrix)
	print_ok("Graph File Generated")
	# Run plotSpot to get selected graphs
	for B in Budget:
		for algo in ["SpellOut", "TopK"]:
			# "TopK" is only run when `baseline` is truthy.
			if algo != "SpellOut"  and not baseline:
				continue
			
			count += 1
			cprint("\nIteration " + str(count), RED)
			# Python 2 print statement.
			print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo
			
			start_time = time.time()
			cprint ("Running PlotSpot Algorithm")
			plots = plotSpot(B, scaled_matrix, algo)
			# (the excerpt ends here; the rest of the loop body is not shown)
			frequencies = generate_frequency_list(plots, scaled_matrix)
Exemplo n.º 5
0
        # start clock -- the deep copy of raw_data is taken before the timer
        # starts, so copying is not part of the measured time.
        data = copy.deepcopy(raw_data)
        start_time = time.time()

        # feature extraction: per-SOURCE aggregates, each passed through
        # fix_zero_error
        data = data_transform.read_data(data)
        users = data.groupby('SOURCE')
        IDs = map(int, users.groups.keys())
        # NOTE(review): `destinations` is not used in the visible code.
        destinations = data.groupby('DESTINATION')
        # Sum of WEIGHT per source.
        AMOUNT = fix_zero_error(users['WEIGHT'].sum().values.tolist())
        # Number of distinct DESTINATION values per source.
        DEST = fix_zero_error(users['DESTINATION'].nunique().values.tolist())
        # First LIFETIME value per source.
        LIFE = fix_zero_error(users['LIFETIME'].first().values.tolist())
        # Count of WEIGHT entries per source.
        IN_EDGE = fix_zero_error(users['WEIGHT'].count().values.tolist())
        # First IAT_VAR value per source.
        IAT_VAR = fix_zero_error(users['IAT_VAR'].first().values.tolist())

        # select plots
        # NOTE(review): eval() is applied to every entry of the three feature
        # lists -- presumably the names of the variables assigned above (IDs,
        # AMOUNT, ...). Only safe while those lists are hard-coded.
        features = combine_features([
            eval(F) for F in identity_features + continuous_features +
            discrete_features
        ])
        iForest(features)
        ranklist.generate_graph(p_val, num_outlier)
        plots = plotSpot(budget, 'SpellOut')

        # end clock -- record the elapsed time under the current num_edge key
        time_elapsed = time.time() - start_time
        running_times[num_edge].append(time_elapsed)
        # Python 2 print statement.
        print num_edge, t, time_elapsed

# Persist the collected running times. The context manager guarantees the
# pickle file is flushed and closed, even if dump() raises.
with open('results/scalability_edges.pkl', 'wb') as results_file:
    pickle.dump(running_times, results_file)
Exemplo n.º 6
0
time_plots = 0  # Count of the number of time plots generated
""" Generate Band Plots """
band_plots = 0  # Count of the number of band plots generated
""" PlotSPOT Algorithm """
# Get Outliers Scores if using iForests
if generate_iForest:
    cprint("Generating Graph File")
    # NOTE(review): eval() is applied to every entry of the three feature
    # lists -- presumably names of variables already in scope. Only safe while
    # those lists are hard-coded and never come from untrusted input.
    features = combine_features([
        eval(F)
        for F in identity_features + continuous_features + discrete_features
    ])
    iForest(features)
    print_ok("iForest Generation Complete")
# Create graph between outliers and plots
cprint("Generating Graph File")
ranklist.generate_graph()
print_ok("Graph File Generated")
# Run plotSpot to get selected graphs
cprint("Running PlotSpot Algorithm")
plots = plotSpot()
print_ok("PlotSpot Complete")
# Save selected plots in pdf
cprint("Saving Plots")
total_plots = get_total_plots(scatter_plots, ccdf_plots, histograms,
                              time_plots, band_plots)
# Python 2 print statements; the trailing comma suppresses the newline so the
# coloured value printed by cprint lands on the same line as its label.
print "\t-> Total Plots Generated = ",
cprint(total_plots, OKBLUE)
print "\t-> Total Plots Chosen = ",
cprint(len(plots), OKBLUE)
print "\t-> Compression = ",
# Compression is the percentage of generated plots that were NOT chosen.
# float() forces true division under Python 2.
# NOTE(review): raises ZeroDivisionError if total_plots is 0 -- confirm that
# cannot happen.
cprint("{0:.2f} %".format((1 - float(len(plots)) / total_plots) * 100), OKBLUE)