def return_pca_debiasing(models, arguments, content):
    logging.info("APP-DE: Forwarding to related definitions")
    database = arguments['space']
    augment_flag = arguments['augments']
    target1, target2, attr1, attr2, augments1, augments2, augments3, augments4 = \
        JSONFormatter.retrieve_vectors_debiasing(content, database, augment_flag)
    target1, target2 = calculation.check_sizes(target1, target2)
    attr1, attr2 = calculation.check_sizes(attr1, attr2)
    logging.info("APP: Final retrieved set sizes: T1=" + str(len(target1)) +
                 " T2=" + str(len(target2)) + " A1=" + str(len(attr1)) +
                 " A2=" + str(len(attr2)))
    if len(target1) == 0 or len(target2) == 0 or len(attr1) == 0 or len(attr2) == 0:
        logging.info("APP: Stopped, no values found in database")
        return jsonify(message="ERROR: No values found in database."), 404

    logging.info("APP: Debiasing process started")
    res1, res2, res3, res4 = {}, {}, {}, {}
    try:
        # GBDD is the default model when none is specified.
        if models is None or models == 'gbdd':
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(
                target1, target2, attr1, attr2,
                augments1, augments2, augments3, augments4)
        elif models == 'bam':
            res1, res2, res3, res4 = bam.bias_alignment_model(
                target1, target2, attr1, attr2,
                augments1, augments2, augments3, augments4)
        elif models == 'gbddxbam':
            # GBDD first, then BAM on the already debiased vectors.
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(
                target1, target2, attr1, attr2,
                augments1, augments2, augments3, augments4)
            res1, res2, res3, res4 = bam.bias_alignment_model(
                res1, res2, res3, res4,
                augments1, augments2, augments3, augments4)
        elif models == 'bamxgbdd':
            # BAM first, then GBDD on the already debiased vectors.
            res1, res2, res3, res4 = bam.bias_alignment_model(
                target1, target2, attr1, attr2,
                augments1, augments2, augments3, augments4)
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(
                res1, res2, res3, res4,
                augments1, augments2, augments3, augments4)

        target1_copy, target2_copy = calculation.create_duplicates(target1, target2)
        attr1_copy, attr2_copy = calculation.create_duplicates(attr1, attr2)
        res1_copy, res2_copy, res3_copy, res4_copy = calculation.create_duplicates(
            res1, res2, res3, res4)
        biased_terms = calculation.concatenate_dicts(
            target1_copy, target2_copy, attr1_copy, attr2_copy)
        debiased_terms = calculation.concatenate_dicts(
            res1_copy, res2_copy, res3_copy, res4_copy)
        biased_pca = calculation.principal_componant_analysis(target1, target2, attr1, attr2)
        debiased_pca = calculation.principal_componant_analysis(res1, res2, res3, res4)
        response = json.dumps(
            {"EmbeddingSpace": database,
             "Model": models,
             "BiasedVectorsPCA": JSONFormatter.dict_to_json(biased_pca),
             "DebiasedVectorsPCA": JSONFormatter.dict_to_json(debiased_pca),
             "BiasedVecs": JSONFormatter.dict_to_json(biased_terms),
             "DebiasedVecs": JSONFormatter.dict_to_json(debiased_terms)})
    except Exception:
        logging.exception("APP: Debiasing failed")
        return jsonify(message="DEBIASING ERROR"), 500

    logging.info("APP: Debiasing process finished")
    return response, 200

def k_means_clustering(target_set1, target_set2, accuracy=50):
    logging.info("KMeans: Calculation started with " + str(accuracy) + " iterations")
    target1, target2 = calculation.create_duplicates(target_set1, target_set2)
    target1 = calculation.transform_dict_to_list(target1)
    target2 = calculation.transform_dict_to_list(target2)
    vector_list = target1 + target2
    logging.info("KMeans: Vector dictionaries and lists prepared successfully")
    gold_standard1 = [1] * len(target1) + [0] * len(target2)
    gold_standard2 = [0] * len(target1) + [1] * len(target2)
    cluster = list(zip(vector_list, gold_standard1, gold_standard2))
    scores = []
    logging.info("KMeans: Cluster & Gold Standards created")
    for i in range(accuracy):
        random.shuffle(cluster)
        k_means = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(
            numpy.array([x[0] for x in cluster]))
        labels = k_means.labels_
        accuracy1 = len([i for i in range(len(labels))
                         if labels[i] == cluster[i][1]]) / len(labels)
        accuracy2 = len([i for i in range(len(labels))
                         if labels[i] == cluster[i][2]]) / len(labels)
        scores.append(max(accuracy1, accuracy2))
    result = sum(scores) / len(scores)
    logging.info("KMeans: Finished calculation")
    logging.info("KMeans: Results: " + str(result))
    return result

def k_means_clustering(target_set1, target_set2, accuracy=50):
    logging.info("Eval-Engine: K-Means++ clustering started")
    target1, target2 = calculation.create_duplicates(target_set1, target_set2)
    target1 = calculation.transform_dict_to_list(target1)
    target2 = calculation.transform_dict_to_list(target2)
    vector_list = numpy.concatenate((target1, target2), axis=0)
    gold_standard1 = [1] * len(target1) + [0] * len(target2)
    gold_standard2 = [0] * len(target1) + [1] * len(target2)
    cluster = list(zip(vector_list, gold_standard1, gold_standard2))
    scores = []
    for i in range(accuracy):
        random.shuffle(cluster)
        k_means = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(
            numpy.array([x[0] for x in cluster]))
        labels = k_means.labels_
        accuracy1 = len([i for i in range(len(labels))
                         if labels[i] == cluster[i][1]]) / len(labels)
        accuracy2 = len([i for i in range(len(labels))
                         if labels[i] == cluster[i][2]]) / len(labels)
        scores.append(max(accuracy1, accuracy2))
    result = sum(scores) / len(scores)
    logging.info("Eval-Engine: K-Means++ clustering score: " + str(result))
    return result

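# Illustrative sketch only (not part of the service): the clustering test above relies on the
# project's `calculation` helpers, so this standalone example reproduces just the core idea --
# run KMeans on two groups of toy vectors and measure how well the cluster labels recover the
# group membership. All names in this sketch are assumptions, not existing project code.
def _kmeans_bias_score_example():
    import numpy
    from sklearn.cluster import KMeans

    rng = numpy.random.default_rng(0)
    group1 = rng.normal(loc=0.0, scale=0.1, size=(10, 5))  # toy "target set 1" vectors
    group2 = rng.normal(loc=1.0, scale=0.1, size=(10, 5))  # toy "target set 2" vectors
    vectors = numpy.vstack([group1, group2])
    truth = numpy.array([0] * len(group1) + [1] * len(group2))

    labels = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(vectors).labels_
    # Cluster ids are arbitrary, so score both possible assignments and keep the better one,
    # mirroring the max(accuracy1, accuracy2) step in k_means_clustering above.
    score = max((labels == truth).mean(), (labels != truth).mean())
    return score  # close to 1.0 means the two sets separate cleanly (stronger bias signal)
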
def embedding_coherence_test(test_set1, test_set2, attribute_set):
    logging.info("ECT: Calculation started")
    # Create duplicates
    test1, test2, argument = calculation.create_duplicates(
        test_set1, test_set2, attribute_set)
    # Transform vector sets into lists
    test_list1 = calculation.transform_dict_to_list(test1)
    test_list2 = calculation.transform_dict_to_list(test2)
    arg_list = calculation.transform_dict_to_list(argument)
    logging.info("ECT: Vector dictionaries/lists prepared successfully")
    mean_target_vector1 = calculation.target_set_mean_vector(test_list1)
    mean_target_vector2 = calculation.target_set_mean_vector(test_list2)
    logging.info("ECT: Target set mean vectors calculated successfully")
    array_sim1 = []
    array_sim2 = []
    for i in range(len(arg_list)):
        memory = arg_list[i]
        cos_sim1 = calculation.cosine_similarity(mean_target_vector1, memory)
        array_sim1.append(cos_sim1)
        cos_sim2 = calculation.cosine_similarity(mean_target_vector2, memory)
        array_sim2.append(cos_sim2)
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    logging.info("ECT: Calculated successfully:")
    logging.info("ECT: Results: " + str(value_array) + " p: " + str(p_value))
    return value_array, p_value

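# Illustrative sketch only: embedding_coherence_test above depends on the project's
# `calculation` helpers, so this standalone example shows the same ECT recipe on toy data --
# average each target set, compute cosine similarities of both means to every attribute
# vector, and correlate the two similarity lists with Spearman's rho. Names are illustrative.
def _ect_example():
    import numpy
    from scipy.stats import spearmanr

    rng = numpy.random.default_rng(0)
    targets1 = rng.normal(size=(8, 5))      # toy target set 1
    targets2 = rng.normal(size=(8, 5))      # toy target set 2
    attributes = rng.normal(size=(12, 5))   # toy attribute set

    def cos(a, b):
        return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

    mean1 = targets1.mean(axis=0)
    mean2 = targets2.mean(axis=0)
    sims1 = [cos(mean1, a) for a in attributes]
    sims2 = [cos(mean2, a) for a in attributes]
    rho, p = spearmanr(sims1, sims2)
    return rho, p  # rho close to 1 means both target sets relate to the attributes alike
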
def word_embedding_association_test(target_set1, target_set2, argument_set1, argument_set2,
                                    accuracy=100):
    logging.info("WEAT: Started calculation")
    target1, target2, arg1, arg2 = calculation.create_duplicates(
        target_set1, target_set2, argument_set1, argument_set2)
    target1, target2, arg1, arg2 = calculation.transform_multiple_dicts_to_lists(
        target1, target2, arg1, arg2)
    target_list = target1 + target2
    logging.info("WEAT: Vector dictionaries and lists prepared successfully")
    # Calculate effect size
    effect_size = effect_size_calculation(target_list, target1, target2, arg1, arg2)
    # Calculate p-value
    logging.info("WEAT: Started p-value calculation")
    s_b_e = differential_association(target1, target2, arg1, arg2)
    s_b_e_all = sum_up_diff_ass_all_permutations(target_list, arg1, arg2, accuracy)
    p_value = p_value_calculation(s_b_e, s_b_e_all)
    logging.info("WEAT: Finished p-value calculation with result " + str(p_value))
    logging.info("WEAT: Finished calculation")
    logging.info("WEAT: Results: effect-size: " + str(effect_size) +
                 " p-value: " + str(p_value))
    return effect_size, p_value

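# Illustrative sketch only: the helpers used above (effect_size_calculation,
# differential_association, ...) are defined elsewhere in the project, so this standalone
# example spells out the standard WEAT effect size on toy vectors: for each target word w,
# s(w) = mean cos(w, A) - mean cos(w, B); the effect size is the difference of the mean s(w)
# over the two target sets, divided by the std of s(w) over their union. Names are illustrative.
def _weat_effect_size_example():
    import numpy

    rng = numpy.random.default_rng(0)
    X = rng.normal(size=(8, 5))   # toy target set 1
    Y = rng.normal(size=(8, 5))   # toy target set 2
    A = rng.normal(size=(6, 5))   # toy attribute set 1
    B = rng.normal(size=(6, 5))   # toy attribute set 2

    def cos(a, b):
        return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

    def association(w):
        return numpy.mean([cos(w, a) for a in A]) - numpy.mean([cos(w, b) for b in B])

    s_x = numpy.array([association(x) for x in X])
    s_y = numpy.array([association(y) for y in Y])
    effect_size = (s_x.mean() - s_y.mean()) / numpy.concatenate([s_x, s_y]).std(ddof=1)
    return effect_size
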
def full_analogy(word1, word2, word3):
    word1_copy, word2_copy, word3_copy = calculation.create_duplicates(
        word1, word2, word3)
    v1 = get_vector_from_small_dict(word1_copy)
    v2 = get_vector_from_small_dict(word2_copy)
    v3 = get_vector_from_small_dict(word3_copy)
    # Analogy query: v1 + v2 - v3, resolved to the nearest word in the embedding space.
    target_vector = numpy.array(v1) + numpy.array(v2) - numpy.array(v3)
    result_word = database_handler.word_for_nearest_vector(target_vector)
    return result_word

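# Illustrative sketch only: full_analogy above resolves the nearest neighbour through
# database_handler.word_for_nearest_vector, which is defined elsewhere. This standalone
# example shows the same idea over a tiny in-memory vocabulary, using cosine similarity to
# pick the word closest to v1 + v2 - v3. All names and vectors are illustrative.
def _analogy_example():
    import numpy

    vocab = {
        "king":  numpy.array([0.9, 0.8, 0.1]),
        "queen": numpy.array([0.9, 0.1, 0.8]),
        "man":   numpy.array([0.1, 0.9, 0.1]),
        "woman": numpy.array([0.1, 0.1, 0.9]),
    }

    def cos(a, b):
        return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

    # Matches the order used by full_analogy: v("king") + v("woman") - v("man").
    target = vocab["king"] + vocab["woman"] - vocab["man"]
    # Exclude the query words themselves, as analogy lookups usually do.
    candidates = {w: v for w, v in vocab.items() if w not in ("king", "woman", "man")}
    return max(candidates, key=lambda w: cos(candidates[w], target))  # -> "queen"
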
def generalized_bias_direction_debiasing(target_set1, target_set2, attributes1, attributes2,
                                         augments1, augments2, augments3, augments4):
    logging.info("GBDD: Debiasing started")
    target1_copy, target2_copy = calculation.create_duplicates(target_set1, target_set2)
    attributes1_copy, attributes2_copy = calculation.create_duplicates(attributes1, attributes2)
    augments1_copy, augments2_copy = calculation.create_duplicates(augments1, augments2)
    augments3_copy, augments4_copy = calculation.create_duplicates(augments3, augments4)
    # Merge the two augmentation dictionaries on each side before estimating the bias direction.
    augments1_copy = calculation.concatenate_dicts(augments1_copy, augments3_copy)
    augments2_copy = calculation.concatenate_dicts(augments2_copy, augments4_copy)
    aug1, aug2 = calculation.transform_multiple_dicts_to_lists(augments1_copy, augments2_copy)
    logging.info("GBDD: Vector dictionaries and lists prepared successfully")
    # Estimate the global bias direction from the augmentation pairs, then apply GBDD
    # to the target and attribute vectors.
    gbdv = calculate_bias_direction_matrix(aug1, aug2)
    new_target1 = calculate_gbdd(gbdv, target1_copy)
    new_target2 = calculate_gbdd(gbdv, target2_copy)
    new_attributes1 = calculate_gbdd(gbdv, attributes1_copy)
    new_attributes2 = calculate_gbdd(gbdv, attributes2_copy)
    logging.info("GBDD: Debiasing finished successfully")
    return new_target1, new_target2, new_attributes1, new_attributes2

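# Illustrative sketch only: calculate_bias_direction_matrix and calculate_gbdd are defined
# elsewhere in the project, so this is not the project's exact implementation. As a rough
# standalone picture of the generalized bias-direction idea, one can stack the difference
# vectors of equivalence pairs, take the dominant singular direction of that matrix as the
# bias direction b, and remove the projection onto b from every vector.
def _gbdd_example(pairs_a, pairs_b, vector_dict):
    """pairs_a/pairs_b: equally long lists of vectors forming equivalence pairs;
    vector_dict: {word: vector} to debias. Purely illustrative."""
    import numpy

    diffs = numpy.array([numpy.asarray(a, dtype=float) - numpy.asarray(b, dtype=float)
                         for a, b in zip(pairs_a, pairs_b)])
    # Top right-singular vector of the difference matrix = dominant bias direction.
    _, _, vh = numpy.linalg.svd(diffs, full_matrices=False)
    bias_direction = vh[0] / numpy.linalg.norm(vh[0])

    debiased = {}
    for word, vec in vector_dict.items():
        vec = numpy.asarray(vec, dtype=float)
        debiased[word] = vec - numpy.dot(vec, bias_direction) * bias_direction
    return debiased
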
def return_eval_ect(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2, database):
    logging.info("APP-BE: Starting ECT evaluation")
    arg_vecs = calculation.concatenate_dicts(calculation.create_duplicates(attr_vectors1),
                                             calculation.create_duplicates(attr_vectors2))
    ect_value, p_value = ect.embedding_coherence_test(target_vectors1, target_vectors2, arg_vecs)
    ect_value1, p_value1 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors1)
    ect_value2, p_value2 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors2)
    logging.info("APP-BE: ECT finished successfully")
    response = json.dumps(
        {"EmbeddingSpace": database,
         "EvaluationMethods": "all",
         "EctValue": ect_value, "EctPValue": p_value,
         "EctValue1": ect_value1, "EctPValue1": p_value1,
         "EctValue2": ect_value2, "EctPValue2": p_value2,
         "T1": JSONFormatter.dict_to_json(target_vectors1),
         "T2": JSONFormatter.dict_to_json(target_vectors2),
         "A1": JSONFormatter.dict_to_json(attr_vectors1),
         "A2": JSONFormatter.dict_to_json(attr_vectors2)})
    # response = jsonify(ect_value1=ect_value1, p_value1=p_value1, p_value2=p_value2,
    #                    ect_value2=ect_value2)
    logging.info("APP-BE: Results: " + str(response))
    return response

def return_eval_all(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2, database):
    logging.info("APP-BE: Starting all evaluations")
    try:
        arg_vecs = calculation.concatenate_dicts(calculation.create_duplicates(attr_vectors1),
                                                 calculation.create_duplicates(attr_vectors2))
        ect_value, p_value = ect.embedding_coherence_test(target_vectors1, target_vectors2, arg_vecs)
        ect_value1, p_value1 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors1)
        ect_value2, p_value2 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors2)
        bat_result = bat.bias_analogy_test(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2)
        # bat_result = 'Currently not available'
        weat_effect_size, weat_p_value = weat.word_embedding_association_test(
            target_vectors1, target_vectors2, attr_vectors1, attr_vectors2)
        kmeans = k_means.k_means_clustering(target_vectors1, target_vectors2)
        logging.info("APP-BE: Evaluations finished successfully")
        response = json.dumps(
            {"EmbeddingSpace": database,
             "EvaluationMethods": "all",
             "EctValue": ect_value, "EctPValue": p_value,
             "EctValue1": ect_value1, "EctPValue1": p_value1,
             "EctValue2": ect_value2, "EctPValue2": p_value2,
             "BatValue": bat_result,
             "WeatEffectSize": weat_effect_size, "WeatPValue": weat_p_value,
             "KmeansValue": kmeans,
             "T1": JSONFormatter.dict_keys_to_string(target_vectors1),
             "T2": JSONFormatter.dict_keys_to_string(target_vectors2),
             "A1": JSONFormatter.dict_keys_to_string(attr_vectors1),
             "A2": JSONFormatter.dict_keys_to_string(attr_vectors2)})
        # response = jsonify(ect_value1=ect_value1, p_value1=p_value1, p_value2=p_value2,
        #                    ect_value2=ect_value2, bat_result=bat_result,
        #                    weat_effect_size=weat_effect_size, weat_pvalue=weat_p_value,
        #                    k_means=kmeans)
        logging.info("APP-BE: Results: " + str(response))
        return response
    except RuntimeWarning as rw:
        print(rw)
        return jsonify(message="Internal Calculation Error")

def load_multiple_augments(word_list, sourcefile):
    fin = io.open(sourcefile, 'r', encoding='utf-8', newline='\n', errors='ignore')
    database = 'fasttext'
    n, d = map(int, fin.readline().split())  # header line: vocabulary size and dimension
    header_offset = fin.tell()
    source_dict = database_handler.get_multiple_vectors_from_db(word_list, database)
    source_dict2 = calculation.create_duplicates(source_dict)
    source_words = list(source_dict2.keys())
    source_vectors = [numpy.array(list(source_dict2[word])) for word in source_words]
    augmentations = {}
    for idx, word in enumerate(source_words):
        # Rewind to the first embedding line for every source word, then rank all other
        # words in the source file by cosine similarity to the current source vector.
        fin.seek(header_offset)
        cosinesim = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            vector = numpy.array([float(value) for value in tokens[1:]])
            if tokens[0] != word:
                cosinesim[tokens[0]] = calculation.cosine_similarity(source_vectors[idx], vector)
        # Keep the four most similar words as augmentations of the source word.
        maxima = []
        for _ in range(4):
            maximum = max(cosinesim, key=lambda k: cosinesim[k])
            cosinesim.pop(maximum)
            maxima.append(maximum)
        augmentations[word] = maxima
    fin.close()
    return augmentations

def bias_alignment_model(target_set1, target_set2, attributes1, attributes2,
                         augments1, augments2, augments3, augments4):
    target1_copy, target2_copy = calculation.create_duplicates(target_set1, target_set2)
    attr1_copy, attr2_copy = calculation.create_duplicates(attributes1, attributes2)
    augments1_copy, augments2_copy = calculation.create_duplicates(augments1, augments2)
    augments3_copy, augments4_copy = calculation.create_duplicates(augments3, augments4)

    term_list, vector_list = [], []
    augments_list, aug1_list, aug2_list = [], [], []
    for word in target1_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(target1_copy[word])))
    for word in target2_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(target2_copy[word])))
    for word in attr1_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(attr1_copy[word])))
    for word in attr2_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(attr2_copy[word])))
    for word in augments1_copy:
        augments_list.append(word)
        aug1_list.append(numpy.array(list(augments1_copy[word])))
    for word in augments2_copy:
        augments_list.append(word)
        aug2_list.append(numpy.array(list(augments2_copy[word])))
    for word in augments3_copy:
        augments_list.append(word)
        aug1_list.append(numpy.array(list(augments3_copy[word])))
    for word in augments4_copy:
        augments_list.append(word)
        aug2_list.append(numpy.array(list(augments4_copy[word])))

    # Build all cross pairs between the two augmentation sides.
    term_pairs = []
    for i in range(len(aug1_list)):
        for j in range(len(aug2_list)):
            term_pairs.append([aug1_list[i], aug2_list[j]])
    x_t1 = numpy.array([pair[0] for pair in term_pairs])
    x_t2 = numpy.array([pair[1] for pair in term_pairs])

    # Orthogonal Procrustes step: W = U V^T from the SVD of X1^T X2 aligns the paired vectors.
    multi = numpy.matmul(numpy.transpose(x_t1), x_t2)
    u, s, vh = numpy.linalg.svd(multi)
    w_matrix = numpy.matmul(u, vh)

    # The debiased vectors are the average of the original and the aligned vectors.
    new_x_matrix = numpy.matmul(vector_list, w_matrix)
    result_matrix = 0.5 * (numpy.array(vector_list) + new_x_matrix)

    result_dict = {}
    for i in range(len(term_list)):
        result_dict[term_list[i]] = result_matrix[i]

    # Split the debiased vectors back into the four input sets.
    t1, t2, a1, a2 = {}, {}, {}, {}
    for word in result_dict:
        if word in target1_copy:
            t1[word] = result_dict[word]
        if word in target2_copy:
            t2[word] = result_dict[word]
        if word in attr1_copy:
            a1[word] = result_dict[word]
        if word in attr2_copy:
            a2[word] = result_dict[word]
    return t1, t2, a1, a2

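# Illustrative sketch only: the SVD step in bias_alignment_model is the closed-form solution
# of the orthogonal Procrustes problem. This standalone check on toy data illustrates that
# the resulting map W is orthogonal and shows the same averaging of original and aligned
# vectors; all names here are illustrative assumptions, not project code.
def _procrustes_alignment_example():
    import numpy

    rng = numpy.random.default_rng(0)
    x_t1 = rng.normal(size=(20, 5))            # toy "left" pair vectors
    x_t2 = rng.normal(size=(20, 5))            # toy "right" pair vectors

    u, _, vh = numpy.linalg.svd(numpy.matmul(x_t1.T, x_t2))
    w_matrix = numpy.matmul(u, vh)

    # Orthogonality check: W^T W should be (numerically) the identity matrix.
    identity_error = numpy.abs(w_matrix.T @ w_matrix - numpy.eye(5)).max()
    aligned = 0.5 * (x_t1 + x_t1 @ w_matrix)   # same averaging as bias_alignment_model
    return identity_error, aligned.shape
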
def debias_net(target_set1, target_set2, argument_set, lambda_value=0.2):
    # NOTE: Only the input preparation is implemented here; the actual DebiasNet
    # debiasing step (which would use lambda_value) is not part of this function yet.
    target1, target2, argument = calculation.create_duplicates(
        target_set1, target_set2, argument_set)
    t1_list, t2_list, arg_list = calculation.transform_multiple_dicts_to_lists(
        target1, target2, argument)

def dict_keys_to_string(vector_dict):
    vector_dict_copy = calculation.create_duplicates(vector_dict)
    keys = ''
    for word in vector_dict_copy.keys():
        keys += str(word) + ' '
    return keys

def dict_to_json(vector_dict):
    vector_dict_copy = calculation.create_duplicates(vector_dict)
    string_dict = {}
    for word in vector_dict_copy:
        string_dict[word] = str(list(vector_dict_copy[word]))
    return string_dict

def bias_analogy_test(target_set1, target_set2, attribute_set1, attribute_set2):
    logging.info("BAT: Calculation started")
    target1, target2, attribute1, attribute2 = calculation.create_duplicates(
        target_set1, target_set2, attribute_set1, attribute_set2)

    counter = 0
    vocab = {}
    vectors = []
    target_1 = []
    target_2 = []
    attributes_1 = []
    attributes_2 = []
    for word in target1:
        vocab[word] = counter
        counter += 1
        target_1.append(word)
        vectors.append(np.array(list(target1[word])))
    for word in target2:
        vocab[word] = counter
        counter += 1
        target_2.append(word)
        vectors.append(np.array(list(target2[word])))
    for word in attribute1:
        vocab[word] = counter
        counter += 1
        attributes_1.append(word)
        vectors.append(np.array(list(attribute1[word])))
    for word in attribute2:
        vocab[word] = counter
        counter += 1
        attributes_2.append(word)
        vectors.append(np.array(list(attribute2[word])))

    attributes_paired = []
    for a1 in attributes_1:
        for a2 in attributes_2:
            attributes_paired.append((a1, a2))

    # Re-index the (deduplicated) vocabulary and stack its vectors into a matrix.
    temporary_vocab = list(set(target_1 + target_2 + attributes_1 + attributes_2))
    dictionary_list = []
    vector_matrix = []
    for w in temporary_vocab:
        if w in vocab:
            vector_matrix.append(vectors[vocab[w]])
            dictionary_list.append(w)
    vector_matrix = np.array(vector_matrix)
    vocab = {dictionary_list[i]: i for i in range(len(dictionary_list))}

    eq_pairs = []
    for t1 in target_1:
        for t2 in target_2:
            eq_pairs.append((t1, t2))

    # Accumulate the counts over all target pairs and attribute pairs.
    biased = []
    totals = []
    for t1, t2 in eq_pairs:
        vec_t1 = vector_matrix[vocab[t1]]
        vec_t2 = vector_matrix[vocab[t2]]
        for a1, a2 in attributes_paired:
            vectors_a1 = vector_matrix[vocab[a1]]
            vectors_a2 = vector_matrix[vocab[a2]]
            diff_vec = vec_t1 - vec_t2

            # Analogy query 1: (t1 - t2) + a2; count how many of the other attribute-2
            # terms rank behind a1 (distances sorted ascending, so smaller rank = closer).
            query_1 = diff_vec + vectors_a2
            sims_q1 = np.sum(np.square(vector_matrix - query_1), axis=1)
            sorted_q1 = np.argsort(sims_q1)
            indices = np.where(sorted_q1 == vocab[a1])[0][0]
            other_attr_2 = [x for x in attributes_2 if x != a2]
            indices_other = [np.where(sorted_q1 == vocab[x])[0][0] for x in other_attr_2]
            number_biased = [x for x in indices_other if indices < x]
            biased.append(len(number_biased))
            totals.append(len(indices_other))

            # Analogy query 2: a1 - (t1 - t2); count how many of the other attribute-1
            # terms rank behind a2.
            query_2 = vectors_a1 - diff_vec
            sims_q2 = np.sum(np.square(vector_matrix - query_2), axis=1)
            sorted_q2 = np.argsort(sims_q2)
            indices = np.where(sorted_q2 == vocab[a2])[0][0]
            other_attr_1 = [x for x in attributes_1 if x != a1]
            indices_other = [np.where(sorted_q2 == vocab[x])[0][0] for x in other_attr_1]
            number_biased = [x for x in indices_other if indices < x]
            biased.append(len(number_biased))
            totals.append(len(indices_other))

    result = sum(biased) / sum(totals)
    logging.info("BAT: Calculated successfully, result: " + str(result))
    return result