import datetime
import io
import logging

import numpy
import psycopg2
from scipy.stats import spearmanr

import calculation
import database_handler


def embedding_coherence_test(t1, t2, a1, a2):
    """Run the Embedding Coherence Test (ECT): correlate the attribute
    similarities of the two target-set mean vectors."""
    logging.info("Eval-Engine: ECT started")
    # Merge both attribute sets and transform the vector sets into lists
    attributes_dict = {}
    for word in a1:
        attributes_dict[word] = a1[word]
    for word in a2:
        attributes_dict[word] = a2[word]
    t1_list = calculation.transform_dict_to_list(t1)
    t2_list = calculation.transform_dict_to_list(t2)
    attributes = calculation.transform_dict_to_list(attributes_dict)
    # logging.info("ECT: Vector dictionaries/lists prepared successfully")
    mean_target_vector1 = calculation.target_set_mean_vector(t1_list)
    mean_target_vector2 = calculation.target_set_mean_vector(t2_list)
    # logging.info("ECT: Target set mean vectors calculated successfully")
    # Collect each attribute's similarity to both mean target vectors
    array_sim1 = []
    array_sim2 = []
    for memory in attributes:
        array_sim1.append(
            calculation.cosine_similarity(mean_target_vector1, memory))
        array_sim2.append(
            calculation.cosine_similarity(mean_target_vector2, memory))
    # Spearman rank correlation between the two similarity rankings
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    logging.info("Eval-Engine: ECT-Scores: " + str(value_array)
                 + "; p-value: " + str(p_value))
    return value_array, p_value
def embedding_coherence_test(test_set1, test_set2, attribute_set):
    """ECT variant that takes a single, pre-merged attribute set."""
    logging.info("ECT: Calculation started")
    # Create duplicates
    test1, test2, argument = calculation.create_duplicates(
        test_set1, test_set2, attribute_set)
    # Transform vector sets into lists
    test_list1 = calculation.transform_dict_to_list(test1)
    test_list2 = calculation.transform_dict_to_list(test2)
    arg_list = calculation.transform_dict_to_list(argument)
    logging.info("ECT: Vector dictionaries/lists prepared successfully")
    mean_target_vector1 = calculation.target_set_mean_vector(test_list1)
    mean_target_vector2 = calculation.target_set_mean_vector(test_list2)
    logging.info("ECT: Target set mean vectors calculated successfully")
    # Collect each attribute's similarity to both mean target vectors
    array_sim1 = []
    array_sim2 = []
    for memory in arg_list:
        array_sim1.append(
            calculation.cosine_similarity(mean_target_vector1, memory))
        array_sim2.append(
            calculation.cosine_similarity(mean_target_vector2, memory))
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    logging.info("ECT: Calculated successfully:")
    logging.info("ECT: Results: " + str(value_array) + " p: " + str(p_value))
    return value_array, p_value
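# Usage sketch (illustrative toy dicts; the helpers in `calculation` are
# assumed to accept {word: vector} mappings):
#
#   t1 = {"doctor": [0.1, 0.3], "engineer": [0.2, 0.1]}
#   t2 = {"nurse": [0.3, 0.2], "teacher": [0.1, 0.4]}
#   attrs = {"salary": [0.2, 0.2], "family": [0.4, 0.1]}
#   correlation, p = embedding_coherence_test(t1, t2, attrs)
#   # A correlation near 1 with a low p-value means both target sets
#   # rank the attributes in roughly the same order.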
def loss_function_ld(t1_list, t2_list, a_list):
    """Collect the squared differences between the cosine similarities of
    the two target sets towards every attribute vector."""
    l_d = []
    for i in range(len(t1_list)):
        for j in range(len(t2_list)):
            for k in range(len(a_list)):
                value = (
                    calculation.cosine_similarity(t1_list[i], a_list[k])
                    - calculation.cosine_similarity(t2_list[j], a_list[k])
                ) ** 2
                l_d.append(value)
    return l_d
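# In math form, the terms collected above are
#
#     L_D = sum over i, j, k of (cos(t1_i, a_k) - cos(t2_j, a_k))^2
#
# i.e. one squared similarity difference for every combination of a
# vector from each target set with an attribute vector.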
def association(target_word, attribute_list1, attribute_list2):
    """WEAT-style association score: mean cosine similarity of the target
    to the first attribute set minus its mean similarity to the second."""
    minuend = []
    subtrahend = []
    for arg in attribute_list1:
        minuend.append(calculation.cosine_similarity(target_word, arg))
    for arg in attribute_list2:
        subtrahend.append(calculation.cosine_similarity(target_word, arg))
    minuend = sum(minuend) / len(attribute_list1)
    subtrahend = sum(subtrahend) / len(attribute_list2)
    return minuend - subtrahend
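# Usage sketch (hypothetical toy vectors, not taken from any embedding):
#
#   target = numpy.array([1.0, 0.0])
#   set_a = [numpy.array([0.9, 0.1])]
#   set_b = [numpy.array([0.1, 0.9])]
#   score = association(target, set_a, set_b)
#   # score > 0: the target is, on average, closer to set_a than set_b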
def load_augment(word, sourcefile):
    """Return the four words in a fastText .vec file whose vectors are
    most cosine-similar to the given word."""
    fin = io.open(sourcefile, 'r', encoding='utf-8', newline='\n',
                  errors='ignore')
    database = 'fasttext'
    # The first line of a .vec file holds "<vocab_size> <dimension>"
    n, d = map(int, fin.readline().split())
    word_vec = database_handler.get_vector_from_database(word, database)
    sourcevector = []
    for key in word_vec:
        sourcevector = numpy.array(list(word_vec[key]))
    # Score every vocabulary entry against the source vector
    cosinesim = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        vector = [float(token) for token in tokens[1:]]
        if tokens[0] != word:
            cosinesim[tokens[0]] = calculation.cosine_similarity(
                sourcevector, numpy.array(vector))
    fin.close()
    # Pop the four nearest neighbours in descending similarity
    maximum1 = max(cosinesim, key=lambda k: cosinesim[k])
    cosinesim.pop(maximum1)
    maximum2 = max(cosinesim, key=lambda k: cosinesim[k])
    cosinesim.pop(maximum2)
    maximum3 = max(cosinesim, key=lambda k: cosinesim[k])
    cosinesim.pop(maximum3)
    maximum4 = max(cosinesim, key=lambda k: cosinesim[k])
    cosinesim.pop(maximum4)
    return [maximum1, maximum2, maximum3, maximum4]
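# Usage sketch (assumes a fastText-format text file whose first line is
# "<vocab_size> <dimension>", followed by "<word> <v1> ... <vd>" lines;
# the filename is only an example):
#
#   neighbours = load_augment("doctor", "cc.en.300.vec")
#   # -> the four vocabulary words most similar to "doctor"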
def compute_augmentations(target, vocab, vecs, iterations=4):
    """Return the `iterations` vocabulary words most cosine-similar to the
    target; the second return value flags an out-of-vocabulary target."""
    augments = []
    cosinesim = {}
    if target not in vocab:
        return augments, True
    target_vec = numpy.array(vecs[vocab[target]])
    # Score every other vocabulary word against the target vector
    for word in vocab:
        vec = numpy.array(vecs[vocab[word]])
        if word != target:
            cosinesim[word] = calculation.cosine_similarity(target_vec, vec)
    # Pop the nearest neighbours in descending similarity
    for i in range(iterations):
        maximum = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum)
        augments.append(maximum)
    return augments, False
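# Usage sketch (assumes vocab maps each word to its row index in vecs):
#
#   vocab = {"king": 0, "queen": 1, "man": 2, "woman": 3}
#   vecs = [[0.9, 0.1], [0.8, 0.3], [0.7, 0.0], [0.6, 0.4]]
#   nearest, oov = compute_augmentations("king", vocab, vecs, iterations=2)
#   # nearest holds the two words most similar to "king"; oov is False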
def word_for_nearest_vector(request_vector, database):
    """Scan the vector table and return the word whose vector is most
    cosine-similar to the requested vector."""
    print(datetime.datetime.now())
    conn = None
    target_word = ''
    maximum_cosine = 0.0
    try:
        conn = psycopg2.connect(dbname=database, user=database_user,
                                host=database_host,
                                password=database_password)
        cur = conn.cursor()
        # Fetch the word next to its vector so the best match can be
        # returned without a second lookup query
        command = """SELECT word, vector FROM {} FETCH FIRST 10000 ROWS ONLY""".format(database)
        cur.execute(command)
        records = cur.fetchall()
        for word, string in records:
            # str(...)[2:-3] strips the delimiters around the stored
            # space-separated vector text
            tokens = str(string)[2:-3].split(' ')
            vector = numpy.array([float(token) for token in tokens])
            current_cosine = calculation.cosine_similarity(
                request_vector, vector)
            if current_cosine > maximum_cosine:
                target_word = word
                maximum_cosine = current_cosine
    except psycopg2.DatabaseError as error:
        logging.error("DB: Database error: %s", error)
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return target_word
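# Design note: the scan above makes one Python-level cosine call per row.
# If all vectors fit in memory, a vectorised numpy sketch (hypothetical
# names; one vector per matrix row, words in a parallel list) is much
# faster:
#
#   sims = matrix @ request_vector / (
#       numpy.linalg.norm(matrix, axis=1)
#       * numpy.linalg.norm(request_vector))
#   nearest = words[int(numpy.argmax(sims))]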
def load_multiple_augments(word_list, sourcefile):
    """Return, for every word in word_list, the four words from a
    fastText .vec file whose vectors are most cosine-similar to it."""
    fin = io.open(sourcefile, 'r', encoding='utf-8', newline='\n',
                  errors='ignore')
    database = 'fasttext'
    # The first line of a .vec file holds "<vocab_size> <dimension>"
    n, d = map(int, fin.readline().split())
    source_dict = database_handler.get_multiple_vectors_from_db(
        word_list, database)
    source_dict2 = calculation.create_duplicates(source_dict)
    # Keep words and vectors in parallel lists so they stay aligned
    source_words = []
    source_vectors = []
    for word in source_dict2:
        source_words.append(word)
        source_vectors.append(numpy.array(list(source_dict2[word])))
    # Read the vocabulary once so it can be scanned for every source word
    lines = fin.readlines()
    fin.close()
    augmentations = {}
    for i in range(len(source_words)):
        cosinesim = {}
        for line in lines:
            tokens = line.rstrip().split(' ')
            vector = [float(token) for token in tokens[1:]]
            if tokens[0] != source_words[i]:
                cosinesim[tokens[0]] = calculation.cosine_similarity(
                    source_vectors[i], numpy.array(vector))
        # Pop the four nearest neighbours in descending similarity
        maximum1 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum1)
        maximum2 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum2)
        maximum3 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum3)
        maximum4 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum4)
        augmentations[source_words[i]] = [
            maximum1, maximum2, maximum3, maximum4
        ]
    return augmentations
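# Usage sketch (the filename is only an example; the result maps each
# input word to its four nearest vocabulary neighbours):
#
#   augs = load_multiple_augments(["doctor", "nurse"], "cc.en.300.vec")
#   # augs["doctor"] -> [w1, w2, w3, w4]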