Example #1
0
def setCentroidsFromLabel(file1, file2, file3, max_len):
    """Compute one centroid per labeled file by averaging sentence embeddings.

    Each file is read line by line; every line is lower-cased and mapped to an
    embedding matrix of length ``max_len``, and the centroid of a cluster is
    the element-wise mean of its matrices.

    Parameters:
        file1, file2, file3: paths to the three label files (utf-8 text, one
            sentence per line).
        max_len: padding length passed to ``SE.map_sentence``.

    Returns:
        (centroid1, centroid2, centroid3) on success, or (None, None, None)
        if any file cannot be opened (the error is printed).
        Original code fell through to the return statement after a failed
        open and crashed with an UnboundLocalError; it also never closed
        ``data_one``.
    """
    try:
        data_one = open(file1, "r", encoding='utf-8')
        data_two = open(file2, 'r', encoding='utf-8')
        data_three = open(file3, 'r', encoding='utf-8')
    except Exception as e:
        print(e)
        return None, None, None

    # NOTE(review): relies on module-level word_to_idx / idx_to_word /
    # embedding globals, as the surrounding snippets do.
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)

    clusters = ([], [], [])
    for handle, cluster in zip((data_one, data_two, data_three), clusters):
        # `with` guarantees each file is closed (the original leaked data_one)
        with handle:
            for line in handle:
                cluster.append(SE.map_sentence(line.lower(), max_len=max_len))

    # Set centroids: np.mean == (1/len) * np.sum over axis 0
    centroid1 = np.mean(clusters[0], axis=0)
    centroid2 = np.mean(clusters[1], axis=0)
    centroid3 = np.mean(clusters[2], axis=0)
    return centroid1, centroid2, centroid3
Example #2
0
def max_len_three(file1, file2, file3):
    """Return the longest embedding-matrix length over all lines of three files.

    Each line of each file is lower-cased, mapped through
    ``SentenceToEmbeddingWithEPSILON.map_sentence``, and measured with
    ``len``; the maximum over all three files is returned.

    Parameters:
        file1, file2, file3: paths to utf-8 text files, one sentence per line.

    Returns:
        The maximum length as an int, or None if any file cannot be opened
        (the error is printed; this makes the original's implicit None
        return explicit).
    """
    try:
        data_one = open(file1, "r", encoding='utf-8')
        data_two = open(file2, 'r', encoding='utf-8')
        data_three = open(file3, 'r', encoding='utf-8')
    except Exception as e:
        print(e)
        return None

    # NOTE(review): relies on module-level word_to_idx / idx_to_word /
    # embedding globals, as the surrounding snippets do.
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)

    # The original triplicated this scan (max_len1/2/3 then max of the three);
    # a single running maximum over all three handles is equivalent.
    longest = 0
    for handle in (data_one, data_two, data_three):
        with handle:
            for line in handle:
                tweet_len = len(SE.map_sentence(line.lower()))
                if tweet_len > longest:
                    longest = tweet_len
    return longest
Example #3
0
def get_max_len(file):
    """Return the length of the longest embedding matrix built from *file*.

    *file* is any iterable of sentence strings (an open file or a list of
    lines); each entry is lower-cased and mapped through
    ``SentenceToEmbeddingWithEPSILON.map_sentence``. Returns 0 for an empty
    iterable, matching the original's initial ``max_len = 0``.
    """
    mapper = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return max(
        (len(mapper.map_sentence(sentence.lower())) for sentence in file),
        default=0,
    )
Example #4
0
def main():
    """Demo: map a sentence to indices/embeddings and exercise the cosine
    similarity and distance helpers on small hand-built matrices."""
    glove = GloveEmbedding("../test/data/glove.6B.50d.txt")
    w2i, i2w, emb = glove.read_embedding()
    print("Length dictionary: ", len(w2i))

    text = "The flu is making me sad".lower()
    print("Sentence: ", text)

    indexer = SentenceToIndices(w2i)
    index_seq = indexer.map_sentence(text)
    print("Sentence to indices: ", index_seq)
    print("Padded: ", PadSentences(10).pad(index_seq))

    embedder = SentenceToEmbeddingWithEPSILON(w2i, i2w, emb)
    mat_a = embedder.map_sentence(text, max_len=len(text))

    other = "The flu is making me sad".lower()
    mat_b = embedder.map_sentence(other, max_len=len(other))

    print("Matrix 1: ", mat_a)
    print("Matrix.shape: ", mat_a.shape)
    print("\n Matrix 2: ", mat_b)
    print("Matrix.shape: ", mat_b.shape)

    # A matrix compared with itself should be maximally similar.
    print("\n Self Similarity: ", matrix_cosine_similary(mat_a, mat_a))

    col_a = np.array([-1, 40, 0.04]).reshape((3, 1))
    col_b = np.array([100, 2, 3]).reshape((3, 1))
    print("M1: \n ", col_a)
    print("M2: \n", col_b)
    sim_mat = matrix_cosine_similary(col_a, col_b)
    print("SimM: \n", sim_mat)
    dist = distance_similarity_matrix(sim_mat)
    print("D: ", dist)

    grid_a = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]])
    grid_b = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]])

    sim_mat = matrix_cosine_similary(grid_a, grid_a)
    print("SimM: \n", sim_mat)
    dist = distance_similarity_matrix(sim_mat)
    print("D: ", dist)

    # Nearly identical matrices: inspect the upper triangle of the
    # similarity matrix and its sums.
    sim_mat = matrix_cosine_similary(grid_a, grid_b)
    print("\nSimM: \n", sim_mat)
    upper = np.triu(sim_mat)
    dist = distance_similarity_matrix(sim_mat)
    print("D: ", dist)
    print("Up: ", upper)
    print("sum Up: ", np.sum(upper))
    print("up I: ", np.triu(np.ones(upper.shape)))
    print("sum I: ", np.sum(np.triu(np.ones(upper.shape))))
Example #5
0
def get_glove(glove_dims):  # get glove embedding matrix
    """Load a GloVe embedding of the requested dimensionality and wrap it.

    Parameters:
        glove_dims: 50, 200, or 300 — selects which GloVe file to read.

    Returns:
        A ``SentenceToEmbeddingWithEPSILON`` built over the loaded embedding.
        Any other dimension prints an error and terminates the process with
        a nonzero status.
    """
    # Dispatch table instead of the if/elif chain; keys are the supported dims.
    glove_files = {
        50: "../test/data/glove.twitter.27B.50d.txt",
        200: "../test/data/glove.twitter.27B.200d.txt",
        300: "../test/data/glove.840B.300d.txt",
    }
    if glove_dims not in glove_files:
        print("Wrong Number of dimensions")
        # BUG FIX: the original called exit(0), reporting *success* to the
        # shell on invalid input; exit nonzero instead.
        exit(1)
    G = GloveEmbedding(filename=glove_files[glove_dims], dimensions=glove_dims)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return SE
Example #6
0
def getGlove():
    """Load the 50-d twitter GloVe embedding and return an epsilon-aware
    sentence-to-embedding mapper over it.

    Returns:
        A ``SentenceToEmbeddingWithEPSILON`` instance.
    """
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    # FIX: removed the unused `S = SentenceToIndices(word_to_idx)` local —
    # it was constructed and never used.
    return SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
Example #7
0
def main():
    """Demo: inspect GloVe lookups, then compare tweet embedding matrices
    with Frobenius distance and the upper-triangle cosine similarity."""
    glove = GloveEmbedding("data/glove.6B.50d.txt")
    w2i, i2w, emb = glove.read_embedding()
    print("embedding shape: ", emb.shape)
    print("idx hello: ", w2i["hello"])
    print("word 20: ", i2w[20])
    hello_vec = emb[w2i["hello"]]
    print("embedding hello: ", hello_vec)
    print("e.shape: ", hello_vec.shape)
    print("<UNK>: ", w2i['<unk>'])
    print("embedding: <UNK>: ", emb[w2i['<unk>']])

    # Print a few individual word vectors (label "ise" kept from the
    # original output for the word 'is').
    for label, word in (("you", "you"), ("he", "he"), ("ise", "is"),
                        ("crazy", "crazy"), ("nuts", "nuts")):
        print(f"embedding of {label}: ", emb[w2i[word]])

    tweets = [
        "You are crazy",
        "You are nuts",
        "He is crazy",
        "You are lazy",
        "You are crazy man",
        "Yes You are crazy",
        "The fast train",
    ]

    mapper = SentenceToEmbeddingWithEPSILON(w2i, i2w, emb)
    mats = [mapper.map_sentence(tweet.lower(), 4) for tweet in tweets]

    # Pairwise comparisons, in the original's order; indices are 1-based
    # tweet numbers to keep the printed labels identical.
    for left, right in ((1, 2), (1, 3), (2, 3), (1, 4), (1, 5), (1, 6), (1, 7)):
        print(f"Distance tweet{left} vs tweet{right}: ")
        print("Frobenious: ", sim.Frobenius_Distance(mats[left - 1], mats[right - 1]))
        print("Cos Tri: ", sim.TriUL_sim(mats[left - 1], mats[right - 1]))

    print("Embedding tweet1: ")
    print(mats[0])
    print("Embedding tweet6: ")
    print(mats[5])
Example #8
0
    ans3 = (np.array(c3)- np.array(oldc3) < EM ).all()
    print("Epsilon 3: ", ans3)

    if ans1 and ans2 and ans3:
        return True
    else:
        return False


if __name__ == "__main__":
    #Step 1: Set Centroids
    print("Step 1: Starting")
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    data = []
    dictionary1  = {}
    dictionary2  = {}
    try:
        datafile = open("data/small_tweets.txt", "r", encoding='utf-8')
        with datafile as f:
            for line in f:
                newline = " ".join(line.split())
                data.append(newline)
    except Exception as e:
        print(e)
    max_len = get_max_len(data)
    finaldata = []
    for line in data:
        emb = SE.map_sentence(line.lower(), max_len=max_len)