예제 #1
0
def CalculateThresholds(increments, metric, score_table):
    """Build a confusion matrix for `metric` at evenly spaced thresholds.

    Every URL of `score_table` that has a known label and status 'on' is
    classified as a booter when its stored `metric` score is greater than or
    equal to the threshold; the outcome is compared against the label in
    urls.[booter?] ('Y' means booter).

    Args:
        increments: Number of steps; thresholds i / increments are evaluated
            for i = 0 .. increments (inclusive).
        metric: Column of the verification table that holds the score.
        score_table: Score table whose URLs are evaluated. Its verification
            table is looked up in `test_table_to_verification`.

    Returns:
        Dict mapping each step index i to a dict with the keys "tp", "fp",
        "tn" and "fn", each holding the list of URLs with that outcome.

    Raises:
        ZeroDivisionError: If `increments` is 0.
        KeyError: If `score_table` has no entry in
            `test_table_to_verification` (only when there are URLs to score).
    """
    # NOTE: table, column and URL values are concatenated straight into the
    # SQL text, so they must come from trusted sources.
    query = ('SELECT urls.domainName, urls.[booter?] FROM ' + score_table + ' '
             + 'INNER JOIN urls ON urls.domainName = ' + score_table + '.domainName '
             + 'WHERE urls.[booter?] != \'?\' AND urls.status = \'on\'')
    test_urls = storage.Select(query)

    # A URL's score does not depend on the threshold, so fetch each score once
    # here instead of re-running the same query for every threshold.
    scored_urls = []
    for test_url in test_urls:
        url = test_url[0]
        is_booter = test_url[1] == 'Y'
        score = storage.Select(
            'SELECT ' + metric + ' FROM ' + test_table_to_verification[score_table]
            + ' WHERE domainName = \'' + url + '\'')[0][0]
        scored_urls.append((url, is_booter, score))

    results = {}
    for i in range(increments + 1):
        threshold = i / increments
        outcome = {"tp": [], "fp": [], "tn": [], "fn": []}
        for url, is_booter, score in scored_urls:
            if score >= threshold:
                outcome["tp" if is_booter else "fp"].append(url)
            elif score < threshold:
                outcome["fn" if is_booter else "tn"].append(url)
            # a score that is neither >= nor < the threshold (e.g. NaN) is
            # left out of all four lists
        results[i] = outcome

    return results
예제 #2
0
 def GetScoreVector(this, table, url):
     """Return the score columns stored for `url` in `table` as a list.

     Reads the first row of `table` whose domainName equals `url` and returns
     every column from index 2 onwards; the first two columns are skipped.
     Raises IndexError when no row matches.
     """
     rows = storage.Select(
         'SELECT * FROM ' + table + " WHERE domainName = '" + url + "'")
     return list(rows[0][2:])
예제 #3
0
###########################################################
verDistance = Verifier_Distance()
verBayes = Verifier_Bayes()
verKNN = Verifier_KNN(True)  # True: use weights
# Verification table paired with each score table.
test_table_to_verification = {
    'scores': 'verification',
    'test_scores': 'verification',
    'test_scores2': 'verification2',
    'test_scores3': 'verification3',
}
test_tables = ['test_scores', 'test_scores2', 'test_scores3']
# test_tables = ['scores']
# Run every distance metric and the Bayes verifier on each URL of each test table.
for table in test_tables:
    print('== ===========================')
    print('TEST TABLE: ' + table)
    print('== ===========================')
    verification_table = test_table_to_verification[table]
    for url in storage.Select('SELECT domainName FROM ' + table):
        print('VERIFYING URL: ' + url[0])
        verDistance.Euclidean_Distance(table, verification_table, url[0])
        verDistance.Squared_Euclidian_Distance(table, verification_table, url[0])
        verDistance.Manhattan_Distance(table, verification_table, url[0])
        verDistance.Cosine_Distance(table, verification_table, url[0])
        # the trailing 0.5 is the extra argument Fractional_Distance takes
        verDistance.Fractional_Distance(table, verification_table, url[0], 0.5)
        verBayes.Calculate(table, verification_table, url[0])
def CheckAccuracy(new_weights):
    """Measure how well cosine distance classifies with `new_weights`.

    Steps:
      1. Install `new_weights` on the shared `verDistance` verifier and
         re-calculate the cosine distance of every URL in the `scores` table.
      2. Sweep 101 thresholds (0.00 .. 1.00) over the `scores` table and keep
         the first threshold with the highest classification accuracy.
      3. Run `accuracy.CalculateAccuracy` with that threshold for cosine and
         derive the final rates from its confusion counts.

    Args:
        new_weights: Weight vector to assign to `verDistance.weights`.

    Returns:
        List [CAR, TI_er, TII_er]: the share of correct classifications, the
        share of false positives and the share of false negatives. All three
        are 0.0 when no URLs were classified at all.
    """
    # - update classifier with new weights
    verDistance.weights = new_weights

    # first, re-calculate Cosine distance values with new weights (on training set)
    for url in storage.Select('SELECT domainName FROM scores'):
        print('CA_1:VERIFYING URL: ' + url[0])
        verDistance.Cosine_Distance('scores',
                                    test_table_to_verification['scores'],
                                    url[0])

    # second, check Cosine distance accuracy on 100 increments and select best t
    threshold_results = accuracy.CalculateThresholds(100, 'cosine', 'scores')
    nr_thresholds = len(threshold_results)
    best_t = 0.0
    max_accuracy = 0.0
    for i in range(nr_thresholds):
        outcome = threshold_results[i]
        nr_correct = len(outcome['tp']) + len(outcome['tn'])
        nr_total = nr_correct + len(outcome['fp']) + len(outcome['fn'])
        # an empty dataset has no accuracy to speak of; treat it as 0 rather
        # than dividing by zero
        CAR = nr_correct / nr_total if nr_total else 0.0
        print(CAR)
        if CAR > max_accuracy:
            max_accuracy = CAR
            # - map the step index back onto the threshold range [0, 1]
            best_t = i / (nr_thresholds - 1)
    print('best threshold: ' + str(best_t))

    # final, use the selected threshold to calculate the accuracy / error rates.
    # Only index 3 (cosine) carries a real threshold; the other metrics are
    # evaluated too but their results are ignored here.
    # NOTE(review): only the `scores` table is re-scored above; confirm the
    # tables read by CalculateAccuracy hold up-to-date cosine values.
    thresholds = [0.0, 0.0, 0.0, best_t, 0.0, 0.0, 0.0]
    cosine = accuracy.CalculateAccuracy(thresholds)['cosine']
    true_positives = cosine['tp']
    true_negatives = cosine['tn']
    false_positives = cosine['fp']
    false_negatives = cosine['fn']
    nr_total = true_positives + true_negatives + false_positives + false_negatives
    if not nr_total:
        return [0.0, 0.0, 0.0]
    CAR = (true_positives + true_negatives) / nr_total
    TI_er = false_positives / nr_total
    TII_er = false_negatives / nr_total
    return [CAR, TI_er, TII_er]
def Gaussian(x=0.5, a=1.0, b=0.5, c=0.5):
    """Evaluate the Gaussian curve a * exp(-(x - b)**2 / (2 * c**2)) at x.

    Args:
        x: Point at which the curve is evaluated.
        a: Y-value of the curve's peak.
        b: Center of the curve (the x at which the peak is reached).
        c: Standard deviation; must be non-zero.

    Returns:
        The value of the curve at x as a float.

    Raises:
        ZeroDivisionError: If c is 0.
    """
    return a * math.exp(-((x - b) ** 2) / (2 * c ** 2))


# Random search over weight vectors: every pass starts from the best weights
# stored in weight_adaptability and perturbs them.
# NOTE(review): no exit condition and no use of new_weights is visible in this
# excerpt; the rest of the loop body appears to be cut off.
iterations = 1
while True:
    iterations += 1
    # 1. get the current best set of weights and the corresponding accuracy rate
    row = storage.Select(
        'SELECT * FROM weight_adaptability ORDER BY car DESC LIMIT 1')
    car = row[0][1]  # best CAR stored in database
    new_weights = list(row[0][4:])  # columns from index 4 onwards are used as the weights
    # 2. update the weights based on the Gaussian distribution curve: each
    #    weight is multiplied by the curve value at a random x in [0.0, 1.001]
    for w in range(0, len(new_weights)):
        random_value = Gaussian(random.uniform(0.0, 1.001))
        new_weights[w] *= random_value
    # 3. create a random outlier so the search gets pushed in a different direction
    # - do this once every 5 iterations
    if iterations % 5 == 0:
        # select which weight index to alter
        w = random.randrange(0, len(new_weights))
        # multiply that weight by a factor between 1.0 and 2.5
        boost = random.uniform(1.0, 2.5)
        print('outlier ' + str(w) + ' boosted by ' + str(boost))
        new_weights[w] *= boost
예제 #6
0
def CalculateAccuracy(thresholds):
    """Average per-metric confusion counts over the three test score tables.

    For every URL of each test table that has a known label and status 'on',
    the stored verification scores are compared against `thresholds`: a score
    greater than or equal to its threshold counts as "classified as booter",
    and the outcome is checked against urls.[booter?] ('Y' means booter).

    Args:
        thresholds: Sequence of thresholds indexed like the module-level
            `metrics` list (thresholds[i] applies to metrics[i]).

    Returns:
        Dict keyed by metric. Each value is a dict with the keys "tp", "tn",
        "fp" and "fn" holding that outcome's URL count summed over the test
        tables and divided by the number of test tables.
    """
    outcome_keys = ("tp", "tn", "fp", "fn")
    test_tables = ['test_scores', 'test_scores2', 'test_scores3']
    # test_tables = ['scores']

    # fetch the labelled URLs of every test table up front
    labelled_urls = {}
    for table in test_tables:
        query = (
            'SELECT urls.domainName, urls.[booter?] FROM {0} '
            'INNER JOIN urls ON urls.domainName = {0}.domainName '
            'WHERE urls.[booter?] != \'?\' AND urls.status = \'on\''
        ).format(table)
        labelled_urls[table] = storage.Select(query)

    # count the confusion outcomes per metric for each test table
    table_counts = {}
    for test_table in labelled_urls:
        counts = {}
        for metric in metrics:
            counts[metric] = {key: 0 for key in outcome_keys}
        for row in labelled_urls[test_table]:
            url = row[0]
            is_booter = row[1] == 'Y'
            scores = storage.Select(
                'SELECT * FROM ' + test_table_to_verification[test_table]
                + " WHERE domainName = '" + url + "'")[0][2:]
            for i in range(len(scores)):
                metric = metrics[i]
                score = scores[i]
                threshold = thresholds[i]
                if score >= threshold:
                    outcome = "tp" if is_booter else "fp"
                elif score < threshold:
                    outcome = "fn" if is_booter else "tn"
                else:
                    # neither comparison holds (e.g. NaN): not counted
                    continue
                counts[metric][outcome] += 1
        table_counts[test_table] = counts

    # sum the counts over all tables, then average by the number of tables
    final_results = {}
    for metric in metrics:
        final_results[metric] = {key: 0 for key in outcome_keys}
    for test_table in table_counts:
        for metric in metrics:
            for key in outcome_keys:
                final_results[metric][key] += table_counts[test_table][metric][key]
    for metric in metrics:
        for key in outcome_keys:
            final_results[metric][key] /= len(test_tables)

    return final_results
예제 #7
0
 def __init__(this):
     """Load the weight vector from the first row of the `weights` table."""
     rows = storage.Select('SELECT * FROM weights')
     first_row = rows[0]
     # skip the domainName and lastUpdate columns; the rest are the weights
     this.weights = first_row[2:]