示例#1
0
def embedding_coherence_test(t1, t2, a1, a2):
    """Run the Embedding Coherence Test (ECT) on two target sets.

    Computes the mean vector of each target set, measures the cosine
    similarity of every attribute vector against both means, and returns
    the Spearman rank correlation between the two similarity profiles.

    :param t1: dict mapping words to vectors (first target set)
    :param t2: dict mapping words to vectors (second target set)
    :param a1: dict mapping words to vectors (first attribute set)
    :param a2: dict mapping words to vectors (second attribute set)
    :return: tuple (spearman correlation, p-value)
    """
    logging.info("Eval-Engine: ECT started")
    # Merge both attribute dicts in one expression; on a key collision a2
    # wins, matching the original sequential-update behavior.
    attributes_dict = {**a1, **a2}

    t1_list = calculation.transform_dict_to_list(t1)
    t2_list = calculation.transform_dict_to_list(t2)
    attributes = calculation.transform_dict_to_list(attributes_dict)
    # NOTE(review): unqualified call — the sibling implementation uses
    # calculation.target_set_mean_vector; confirm this name is imported.
    mean_target_vector1 = target_set_mean_vector(t1_list)
    mean_target_vector2 = target_set_mean_vector(t2_list)
    array_sim1 = []
    array_sim2 = []
    # Similarity profile of every attribute vector against each target mean.
    for attribute in attributes:
        array_sim1.append(
            calculation.cosine_similarity(mean_target_vector1, attribute))
        array_sim2.append(
            calculation.cosine_similarity(mean_target_vector2, attribute))
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    # Lazy %-style args avoid building the string when INFO is disabled.
    logging.info("Eval-Engine: ECT-Scores: %s; p-value: %s",
                 value_array, p_value)
    return value_array, p_value
示例#2
0
def embedding_coherence_test(test_set1, test_set2, attribute_set):
    """Embedding Coherence Test over two target sets and one attribute set.

    Duplicates the inputs, computes each target set's mean vector, and
    returns the Spearman correlation (plus p-value) between the attribute
    vectors' similarities to the two means.

    :param test_set1: dict of word -> vector (first target set)
    :param test_set2: dict of word -> vector (second target set)
    :param attribute_set: dict of word -> vector (attributes)
    :return: tuple (spearman correlation, p-value)
    """
    logging.info("ECT: Calculation started")
    # Work on duplicates so the caller's dictionaries stay untouched.
    test1, test2, argument = calculation.create_duplicates(
        test_set1, test_set2, attribute_set)
    # Flatten the vector dictionaries into plain lists.
    test_list1 = calculation.transform_dict_to_list(test1)
    test_list2 = calculation.transform_dict_to_list(test2)
    arg_list = calculation.transform_dict_to_list(argument)
    logging.info("ECT: Vector dictionaries/lists prepared successfully")
    mean_target_vector1 = calculation.target_set_mean_vector(test_list1)
    mean_target_vector2 = calculation.target_set_mean_vector(test_list2)
    logging.info("ECT: Target set mean vectors calculated successfully")
    # Similarity of every attribute vector to each target-set mean.
    array_sim1 = [calculation.cosine_similarity(mean_target_vector1, vec)
                  for vec in arg_list]
    array_sim2 = [calculation.cosine_similarity(mean_target_vector2, vec)
                  for vec in arg_list]
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    logging.info("ECT: Calculated successfully:")
    logging.info("ECT: Results: " + str(value_array) + " p: " + str(p_value))
    return value_array, p_value
示例#3
0
def loss_function_ld(t1_list, t2_list, a_list):
    """Compute the squared similarity-difference loss terms L_D.

    For every pair (t1, t2) of target vectors and every attribute vector a,
    collects (cos(t1, a) - cos(t2, a)) ** 2.

    :param t1_list: vectors of the first target set
    :param t2_list: vectors of the second target set
    :param a_list: attribute vectors
    :return: list of squared cosine-similarity differences
    """
    l_d = []
    for t1_vec in t1_list:
        for t2_vec in t2_list:
            for a_vec in a_list:
                diff = (calculation.cosine_similarity(t1_vec, a_vec) -
                        calculation.cosine_similarity(t2_vec, a_vec))
                # BUG FIX: the original squared with '^' (bitwise XOR, a
                # TypeError on floats) instead of '**'.
                l_d.append(diff ** 2)
    # BUG FIX: the original built l_d but never returned it.
    return l_d
示例#4
0
def association(target_word, attribute_list1, attribute_list2):
    """Return the mean-similarity difference of a word to two attribute sets.

    association = mean cosine(target, a) over attribute_list1
                  minus mean cosine(target, a) over attribute_list2

    :param target_word: vector of the target word
    :param attribute_list1: vectors of the first attribute set
    :param attribute_list2: vectors of the second attribute set
    :return: difference of the two mean similarities
    """
    sims_to_first = [calculation.cosine_similarity(target_word, attribute)
                     for attribute in attribute_list1]
    sims_to_second = [calculation.cosine_similarity(target_word, attribute)
                      for attribute in attribute_list2]
    mean_first = sum(sims_to_first) / len(attribute_list1)
    mean_second = sum(sims_to_second) / len(attribute_list2)
    return mean_first - mean_second
示例#5
0
def load_augment(word, sourcefile):
    """Return the four vocabulary words most similar to *word*.

    Looks up *word*'s vector in the 'fasttext' database, then scans the
    fastText text-format *sourcefile* (header line "n d", then one
    "token v1 v2 ..." line per token) and returns the four tokens with
    the highest cosine similarity to the source vector.

    :param word: word whose nearest neighbours are requested
    :param sourcefile: path to a fastText .vec file
    :return: list of the four most similar tokens, best first
    """
    database = 'fasttext'
    word_vec = database_handler.get_vector_from_database(word, database)
    # BUG FIX: the original reused the parameter name `word` as the loop
    # variable here, clobbering the caller's word before the comparison
    # against tokens[0] below.
    sourcevector = []
    for key in word_vec:
        sourcevector = numpy.array(list(word_vec[key]))
    cosinesim = {}
    # Close the embeddings file deterministically (original leaked it).
    with io.open(sourcefile, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        # Consume the "<vocab-size> <dimension>" header line.
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            vector = [float(component) for component in tokens[1:]]
            if tokens[0] != word:
                cosinesim[tokens[0]] = calculation.cosine_similarity(
                    sourcevector, numpy.array(vector))
    # Pop the four best-scoring tokens (replaces four copy-pasted blocks).
    maxima = []
    for _ in range(4):
        best = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(best)
        maxima.append(best)
    return maxima
示例#6
0
def compute_augmentations(target, vocab, vecs, iterations=4):
    """Collect the *iterations* nearest vocabulary neighbours of *target*.

    :param target: word to find neighbours for
    :param vocab: mapping of word -> index into vecs
    :param vecs: sequence of vectors indexed by vocab
    :param iterations: how many neighbours to return
    :return: (augments, missing) — missing is True (with an empty list)
        when target is not in vocab
    """
    augments = []
    if target not in vocab:
        return augments, True
    target_vec = np.array(vecs[vocab[target]])
    # Cosine similarity of every other vocabulary entry to the target.
    cosinesim = {
        candidate: calculation.cosine_similarity(
            target_vec, np.array(vecs[vocab[candidate]]))
        for candidate in vocab if candidate != target
    }
    # Repeatedly pop the current best match.
    for _ in range(iterations):
        best = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(best)
        augments.append(best)
    return augments, False
示例#7
0
def word_for_nearest_vector(request_vector, database):
    """Find the stored word whose vector is most similar to request_vector.

    Scans up to the first 10000 vectors in table *database*, keeps the one
    with the highest cosine similarity, then looks up the word stored with
    that exact vector string.

    :param request_vector: numpy array to compare against stored vectors
    :param database: database (and table) name to query
    :return: the best-matching word record, or '' on failure
    """
    print(datetime.datetime.now())
    conn = None
    target_word = ''
    maximum_vector = []
    maximum_cosine = 0.0
    try:
        conn = psycopg2.connect(dbname=database,
                                user=database_user,
                                host=database_host,
                                password=database_password)
        cur = conn.cursor()
        # BUG FIX: 'FETCH FIRST n ONLY' is invalid SQL — the ROW/ROWS
        # keyword is required, so the original query always failed.
        # NOTE(review): the table name is interpolated via format(); this is
        # only acceptable because `database` is an internal identifier —
        # never pass untrusted input here.
        command = """SELECT vector FROM {} FETCH FIRST 10000 ROWS ONLY""".format(
            database)
        cur.execute(command)
        records = cur.fetchall()
        # BUG FIX: the original only inspected records[0] (a single row),
        # defeating the point of fetching 10000 rows.
        for record in records:
            for string in record:
                # Stored format appears to be bracketed with two-space
                # separators — strip the brackets and parse the components.
                tokens = str(string)[2:-3].split('  ')
                vector = numpy.array([float(token) for token in tokens])
                current_cosine = calculation.cosine_similarity(
                    request_vector, vector)
                if current_cosine > maximum_cosine:
                    maximum_vector = vector
                    maximum_cosine = current_cosine
        command2 = """SELECT word FROM {} WHERE vector='{}'""".format(
            database, str(maximum_vector))
        cur.execute(command2)
        records2 = cur.fetchall()
        target_word = records2[0]
    except psycopg2.DatabaseError as error:
        # BUG FIX: the original passed `error` as a stray positional logging
        # argument with no matching %s placeholder.
        logging.error("DB: Database error: %s", error)
        print(error)
    finally:
        if conn is not None:
            conn.close()
    return target_word
示例#8
0
def load_multiple_augments(word_list, sourcefile):
    """Return the four nearest file-vocabulary neighbours for each word.

    Fetches the vectors for *word_list* from the 'fasttext' database and,
    for every source word, scans the fastText-format *sourcefile* for the
    four tokens with the highest cosine similarity.

    :param word_list: words whose neighbours are requested
    :param sourcefile: path to a fastText .vec file ("n d" header line,
        then one "token v1 v2 ..." line per token)
    :return: dict mapping each source word to its four nearest tokens
    """
    database = 'fasttext'
    source_dict = database_handler.get_multiple_vectors_from_db(
        word_list, database)
    # NOTE(review): assumes create_duplicates returns a copy with the same
    # keys as source_dict — confirm against the helper.
    source_dict2 = calculation.create_duplicates(source_dict)
    # Keep words and vectors aligned by position.
    source_words = list(source_dict2)
    source_vectors = [numpy.array(list(source_dict2[source_word]))
                      for source_word in source_words]
    augmentations = {}
    # BUG FIX: the original iterated the open file inside the outer loop, so
    # the exhausted iterator yielded no lines for every word after the first.
    # Read the lines once and close the handle deterministically.
    with io.open(sourcefile, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        # Consume the "<vocab-size> <dimension>" header line.
        n, d = map(int, fin.readline().split())
        lines = fin.readlines()
    for index, source_word in enumerate(source_words):
        cosinesim = {}
        for line in lines:
            # BUG FIX: the original reused loop variable `i` for both the
            # word index and the token index, and indexed the dict with an
            # integer (source_dict[i]) instead of iterating its keys.
            tokens = line.rstrip().split(' ')
            vector = numpy.array([float(token) for token in tokens[1:]])
            if tokens[0] != source_word:
                cosinesim[tokens[0]] = calculation.cosine_similarity(
                    source_vectors[index], vector)
        # Pop the four best matches (replaces four copy-pasted blocks).
        best = []
        for _ in range(4):
            maximum = max(cosinesim, key=lambda k: cosinesim[k])
            cosinesim.pop(maximum)
            best.append(maximum)
        augmentations[source_word] = best
    return augmentations