def CalculateThresholds(increments, metric, score_table):
    """Classify every labelled URL at evenly spaced thresholds in [0, 1].

    The labelled URLs are those of `score_table` that join onto the `urls`
    table, have a known booter label (not '?') and have status 'on'. For each
    of them the `metric` column is read from the verification table that
    `test_table_to_verification` associates with `score_table`.

    Args:
        increments: number of steps the [0, 1] range is divided into; the
            thresholds tested are i / increments for i in 0..increments.
        metric: name of the score column to read.
        score_table: name of the score table selecting the URLs to test.

    Returns:
        A dict mapping each step index i to a dict with the keys "tp", "fp",
        "tn" and "fn", each holding the list of domain names that fell into
        that confusion class at threshold i / increments. A score at or above
        the threshold counts as a booter prediction.

    Raises:
        ZeroDivisionError: if increments is 0.
    """
    # select all labelled, active urls that are present in the score table
    query = 'SELECT urls.domainName, urls.[booter?] FROM ' + score_table + ' '
    query += 'INNER JOIN urls ON urls.domainName = ' + score_table + '.domainName '
    query += 'WHERE urls.[booter?] != \'?\' AND urls.status = \'on\''
    test_urls = storage.Select(query)

    # a url's score does not depend on the threshold being tested, so fetch
    # it once here instead of once per threshold
    labelled_scores = []
    for test_url in test_urls:
        url = test_url[0]
        is_booter = test_url[1] == 'Y'
        score = storage.Select(
            'SELECT ' + metric + ' FROM '
            + test_table_to_verification[score_table]
            + ' WHERE domainName = \'' + url + '\'')[0][0]
        labelled_scores.append((url, is_booter, score))

    results = {}
    for i in range(0, increments + 1):
        threshold = i / increments
        # start with empty lists on each of the confusion metrics
        outcome = {"tp": [], "fp": [], "tn": [], "fn": []}
        # then test for accuracy
        for url, is_booter, score in labelled_scores:
            if score >= threshold:
                outcome["tp" if is_booter else "fp"].append(url)
            elif score < threshold:
                outcome["fn" if is_booter else "tn"].append(url)
        results[i] = outcome
    return results
def GetScoreVector(this, table, url):
    """Return the score values stored for `url` in `table` as a list.

    Selects every column of the rows in `table` whose domainName equals
    `url` and returns the first matching row from its third column onward.
    Indexing the first row fails if the query matched nothing.
    """
    rows = storage.Select(
        'SELECT * FROM ' + table + ' WHERE domainName = \'' + url + '\'')
    # the first two columns are not part of the score vector
    return list(rows[0][2:])
###########################################################
# one verifier instance per classification approach
verDistance = Verifier_Distance()
verBayes = Verifier_Bayes()
verKNN = Verifier_KNN(True)  # use weights

# maps each score table onto the verification table its results belong to
test_table_to_verification = {
    'scores': 'verification',
    'test_scores': 'verification',
    'test_scores2': 'verification2',
    'test_scores3': 'verification3',
}

test_tables = ['test_scores', 'test_scores2', 'test_scores3']
# test_tables = ['scores']

# run every distance metric and the Bayes verifier on each url of each table
for table in test_tables:
    print('== ===========================')
    print('TEST TABLE: ' + table)
    print('== ===========================')
    verification_table = test_table_to_verification[table]
    for url in storage.Select('SELECT domainName FROM ' + table):
        domain = url[0]
        print('VERIFYING URL: ' + domain)
        verDistance.Euclidean_Distance(table, verification_table, domain)
        verDistance.Squared_Euclidian_Distance(table, verification_table, domain)
        verDistance.Manhattan_Distance(table, verification_table, domain)
        verDistance.Cosine_Distance(table, verification_table, domain)
        verDistance.Fractional_Distance(table, verification_table, domain, 0.5)
        verBayes.Calculate(table, verification_table, domain)
def CheckAccuracy(new_weights):
    """Measure how well the Cosine distance classifies with `new_weights`.

    Steps:
      1. install `new_weights` on the shared `verDistance` verifier and
         re-calculate the Cosine distance for every url in the `scores` table;
      2. test 100 threshold increments via `accuracy.CalculateThresholds` and
         keep the threshold with the highest classification accuracy rate;
      3. pass that threshold to `accuracy.CalculateAccuracy` and derive the
         final rates from the confusion counts it reports for 'cosine'.

    Args:
        new_weights: weight vector assigned to `verDistance.weights`.

    Returns:
        [CAR, TI_er, TII_er]: the classification accuracy rate, the type I
        error rate (false positives / total) and the type II error rate
        (false negatives / total).

    Raises:
        ZeroDivisionError: if a threshold result, or the final accuracy
            result, contains no classified urls at all.
    """
    # - update classifier with new weights
    verDistance.weights = new_weights
    # previously hard-coded weight vector, kept for reference:
    # verDistance.weights = [16.7194209217615, 2.60829499230467,
    #     2.2260372998623, 2.90305925398725, 5.09533279100191,
    #     2.99409304662811, 3.1147007335932, 5.27110197918494,
    #     7.929635390837, 3.5367525886694, 6.76277542448109,
    #     1.62292767902453, 0.707530369202495, 0.492143303569623,
    #     0.644893274893918]

    # first, re-calculate Cosine distance values with new weights
    # (on training set)
    for url in storage.Select('SELECT domainName FROM scores'):
        print('CA_1:VERIFYING URL: ' + url[0])
        verDistance.Cosine_Distance(
            'scores', test_table_to_verification['scores'], url[0])

    # second, check Cosine distance accuracy on 100 thresholds and select
    # the best one (on scores database)
    threshold_results = accuracy.CalculateThresholds(100, 'cosine', 'scores')

    best_t = 0.0
    max_accuracy = 0.0
    for i in range(0, len(threshold_results)):
        outcome = threshold_results[i]
        nr_correct = len(outcome['tp']) + len(outcome['tn'])
        nr_total = nr_correct + len(outcome['fp']) + len(outcome['fn'])
        CAR = nr_correct / nr_total
        print(CAR)
        # strict comparison: on ties the lowest threshold wins
        if CAR > max_accuracy:
            max_accuracy = CAR
            # - map the step index onto a threshold value in range [0,1]
            best_t = i / (len(threshold_results) - 1)
    print('best threshold: ' + str(best_t))

    # final, use the selected threshold to calculate the new accuracy rates.
    # The Cosine values of the test tables are not re-calculated here.
    # NOTE(review): an earlier comment stated the evaluation now runs on the
    # scores table alone - confirm which tables accuracy.CalculateAccuracy
    # actually reads.
    # - we only care about the Cosine threshold; the others stay at 0.0
    thresholds = [0.0, 0.0, 0.0, best_t, 0.0, 0.0, 0.0]
    accuracy_results = accuracy.CalculateAccuracy(thresholds)

    # - now get accuracy rate of cosine distance with new weights and best_t
    true_positives = accuracy_results['cosine']['tp']
    true_negatives = accuracy_results['cosine']['tn']
    false_positives = accuracy_results['cosine']['fp']
    false_negatives = accuracy_results['cosine']['fn']
    nr_total = true_positives + true_negatives + false_positives + false_negatives
    CAR = (true_positives + true_negatives) / nr_total
    TI_er = false_positives / nr_total
    TII_er = false_negatives / nr_total

    # - finally return newly calculated accuracy
    return [CAR, TI_er, TII_er]
def Gaussian(x=0.5, a=1.0, b=0.5, c=0.5):
    """Evaluate a Gaussian curve at `x`.

    Args:
        x: position at which the curve is evaluated.
        a: peak y-value of the curve.
        b: center of the curve (the x at which the peak is reached).
        c: standard deviation of the curve; must be non-zero.

    Returns:
        a * exp(-(x - b)**2 / (2 * c**2))
    """
    return a * math.exp(-((x - b)**2) / (2 * c**2))


iterations = 1
while True:
    iterations += 1

    # 1. get current best set of weights and corresponding accuracy rate
    row = storage.Select(
        'SELECT * FROM weight_adaptability ORDER BY car DESC LIMIT 1')
    car = row[0][1]  # best CAR stored in database
    new_weights = list(row[0][4:])

    # 2. update weights based on Gaussian distribution curve
    #    (with the default curve each factor lies between roughly 0.6 and
    #    1.0, so this step never increases a weight)
    for w in range(0, len(new_weights)):
        random_value = Gaussian(random.uniform(0.0, 1.001))
        new_weights[w] *= random_value

    # 3. create random outliers so the search is thrown in different
    #    directions
    # - do this once every 5 iterations
    if iterations % 5 == 0:
        # select which weight index to alter
        w = random.randrange(0, len(new_weights))
        # increase weight by a factor between 1.0 - 2.5
        boost = random.uniform(1.0, 2.5)
        print('outlier ' + str(w) + ' boosted by ' + str(boost))
        new_weights[w] *= boost

    # NOTE(review): no evaluation step or exit condition is visible in this
    # part of the loop - presumably the remainder of the body follows; confirm.
def CalculateAccuracy(thresholds):
    """Average the confusion counts of every metric over the test datasets.

    For each test table the labelled, active urls are fetched, their score
    row is read from the matching verification table, and every score is
    compared against the threshold at the same position: a score at or above
    its threshold counts as a booter prediction.

    Args:
        thresholds: one threshold per metric, in the same order as `metrics`.

    Returns:
        A dict mapping each metric to a dict with the keys "tp", "tn", "fp"
        and "fn", each holding the number of urls in that confusion class
        summed over all test tables and divided by the number of test tables.
    """
    # the datasets every metric is evaluated on
    test_tables = ['test_scores', 'test_scores2', 'test_scores3']
    # test_tables = ['scores']
    outcome_keys = ("tp", "tn", "fp", "fn")

    # step 1: per dataset, fetch the labelled urls that are switched on
    labelled_urls = {}
    for table in test_tables:
        labelled_urls[table] = storage.Select(
            'SELECT urls.domainName, urls.[booter?] FROM ' + table + ' '
            + 'INNER JOIN urls ON urls.domainName = ' + table + '.domainName '
            + 'WHERE urls.[booter?] != \'?\' AND urls.status = \'on\'')

    # step 2: per dataset, sort every url into a confusion class per metric
    per_table = {}
    for table_name in labelled_urls:
        # metric -> confusion class -> list of urls
        buckets = {}
        for metric in metrics:
            buckets[metric] = {key: [] for key in outcome_keys}

        for row in labelled_urls[table_name]:
            url = row[0]
            is_booter = row[1] == 'Y'
            scores = storage.Select(
                'SELECT * FROM ' + test_table_to_verification[table_name]
                + ' WHERE domainName = \'' + url + '\'')[0][2:]
            for i, score in enumerate(scores):
                metric = metrics[i]
                threshold = thresholds[i]
                if score >= threshold:
                    outcome = "tp" if is_booter else "fp"
                elif score < threshold:
                    outcome = "fn" if is_booter else "tn"
                else:
                    # score is not comparable to the threshold: not counted
                    continue
                buckets[metric][outcome].append(url)

        per_table[table_name] = buckets

    # step 3: add up the class sizes over all datasets and average them
    final_results = {}
    for metric in metrics:
        final_results[metric] = {key: 0 for key in outcome_keys}

    for table_name in per_table:
        for metric in metrics:
            for key in outcome_keys:
                final_results[metric][key] += len(
                    per_table[table_name][metric][key])

    for metric in metrics:
        for key in outcome_keys:
            final_results[metric][key] /= len(test_tables)

    return final_results
def __init__(this):
    """Load the weight vector from the first row of the `weights` table."""
    rows = storage.Select('SELECT * FROM weights')
    first_row = rows[0]
    # do not include domainName and lastUpdate column
    this.weights = first_row[2:]