Example #1
import numpy
from scipy.stats import spearmanr

# load_wordsimi353() and dot_prod() are helper functions defined elsewhere in this module.
def wordsimi353(embeddingDict, average_emb, rand, dim, ML):
    # Evaluate embeddings on WordSim-353: embeddingDict maps each word to its
    # embedding vector and is built by the caller, e.g. with loadEmbeddingFile(filename).
    wordPairs2label = load_wordsimi353()
    wordPair2simi = {}
    labels = []
    predicts = []
    unknown = 0
    for (word1, word2), label in wordPairs2label.items():
        embedding1 = embeddingDict.get(word1)
        embedding2 = embeddingDict.get(word2)
        if embedding1 is not None and embedding2 is not None:
            # Both words are in vocabulary: predict similarity with their dot product.
            predict = dot_prod(embedding1, embedding2)
            wordPair2simi[(word1, word2)] = predict
            predicts.append(predict)
            labels.append(label)
        else:
            # At least one word is out of vocabulary; such pairs are counted in
            # `unknown`. In ML mode they are skipped; otherwise back off below.
            if not ML:
                # Back off to a random unit vector (rand=True) or the average embedding.
                if embedding1 is None:
                    if rand:
                        vector = numpy.random.uniform(-1, 1, dim)
                        embedding1 = list(vector / numpy.linalg.norm(vector))
                    else:
                        embedding1 = average_emb
                if embedding2 is None:
                    if rand:
                        vector = numpy.random.uniform(-1, 1, dim)
                        embedding2 = list(vector / numpy.linalg.norm(vector))
                    else:
                        embedding2 = average_emb
                predict = dot_prod(embedding1, embedding2)
                wordPair2simi[(word1, word2)] = predict
                predicts.append(predict)
                labels.append(label)

            unknown += 1
    sp, p_value = spearmanr(labels, predicts)
    print('Spearman correlation: ' + str(sp) + ', unknown pairs: ' + str(unknown))
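For quick orientation, here is a minimal, self-contained sketch of the same evaluation loop on a toy vocabulary. It assumes dot_prod is a plain dot product and inlines a tiny stand-in for load_wordsimi353(); both are repository helpers not shown on this page, and the pair scores below are only illustrative values in the WordSim-353 0-10 range.

import numpy
from scipy.stats import spearmanr

# Hypothetical stand-in for load_wordsimi353(): (word1, word2) -> gold score.
word_pairs = {('tiger', 'cat'): 7.35,
              ('book', 'paper'): 7.46,
              ('king', 'cabbage'): 0.23}

# Toy embeddings; the real code loads these from disk into embeddingDict.
rng = numpy.random.RandomState(0)
embeddings = {w: rng.uniform(-1, 1, 50) for pair in word_pairs for w in pair}

labels, predicts = [], []
for (w1, w2), gold in word_pairs.items():
    e1, e2 = embeddings.get(w1), embeddings.get(w2)
    if e1 is not None and e2 is not None:
        predicts.append(float(numpy.dot(e1, e2)))  # dot_prod in the original
        labels.append(gold)

sp, p_value = spearmanr(labels, predicts)
print('Spearman correlation: ' + str(sp))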
Example #3
import numpy
from scipy.stats import spearmanr

# load_RW() and dot_prod() are helper functions defined elsewhere in this module.
def RW(embeddingDict, average_emb, rand, dim, ML):
    # Same evaluation as wordsimi353 above, but on the Rare Word (RW) dataset;
    # embeddingDict is built by the caller, e.g. with loadEmbeddingFile(filename).
    wordPairs2label = load_RW()
    wordPair2simi = {}
    labels = []
    predicts = []
    unknown = 0
    for (word1, word2), label in wordPairs2label.items():
        embedding1 = embeddingDict.get(word1)
        embedding2 = embeddingDict.get(word2)
        if embedding1 is not None and embedding2 is not None:
            # Both words are in vocabulary: predict similarity with their dot product.
            predict = dot_prod(embedding1, embedding2)
            wordPair2simi[(word1, word2)] = predict
            predicts.append(predict)
            labels.append(label)
        else:
            # At least one word is out of vocabulary; such pairs are counted in
            # `unknown`. In ML mode they are skipped; otherwise back off below.
            if not ML:
                # Back off to a random unit vector (rand=True) or the average embedding.
                if embedding1 is None:
                    if rand:
                        vector = numpy.random.uniform(-1, 1, dim)
                        embedding1 = list(vector / numpy.linalg.norm(vector))
                    else:
                        embedding1 = average_emb
                if embedding2 is None:
                    if rand:
                        vector = numpy.random.uniform(-1, 1, dim)
                        embedding2 = list(vector / numpy.linalg.norm(vector))
                    else:
                        embedding2 = average_emb
                predict = dot_prod(embedding1, embedding2)
                wordPair2simi[(word1, word2)] = predict
                predicts.append(predict)
                labels.append(label)

            unknown += 1
    sp, p_value = spearmanr(labels, predicts)
    print('Spearman correlation: ' + str(sp) + ', unknown pairs: ' + str(unknown))
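Since RW is identical to wordsimi353 except for the dataset loader, both could delegate to one shared evaluator. The following is only a possible refactoring sketch: it treats the repository helpers load_wordsimi353, load_RW, and dot_prod as assumed imports from the surrounding module, and drops the wordPair2simi dict, which neither original function returns or reads.

import numpy
from scipy.stats import spearmanr

def backoff_vector(average_emb, rand, dim):
    # Random unit vector, or the precomputed average embedding.
    if rand:
        v = numpy.random.uniform(-1, 1, dim)
        return list(v / numpy.linalg.norm(v))
    return average_emb

def evaluate_similarity(wordPairs2label, embeddingDict, average_emb, rand, dim, ML):
    # Spearman correlation between gold scores and dot-product similarities.
    labels, predicts, unknown = [], [], 0
    for (word1, word2), label in wordPairs2label.items():
        embedding1 = embeddingDict.get(word1)
        embedding2 = embeddingDict.get(word2)
        if embedding1 is None or embedding2 is None:
            unknown += 1
            if ML:  # in ML mode, skip out-of-vocabulary pairs entirely
                continue
            if embedding1 is None:
                embedding1 = backoff_vector(average_emb, rand, dim)
            if embedding2 is None:
                embedding2 = backoff_vector(average_emb, rand, dim)
        predicts.append(dot_prod(embedding1, embedding2))  # repo helper, assumed
        labels.append(label)
    sp, p_value = spearmanr(labels, predicts)
    print('Spearman correlation: ' + str(sp) + ', unknown pairs: ' + str(unknown))

def wordsimi353(embeddingDict, average_emb, rand, dim, ML):
    evaluate_similarity(load_wordsimi353(), embeddingDict, average_emb, rand, dim, ML)

def RW(embeddingDict, average_emb, rand, dim, ML):
    evaluate_similarity(load_RW(), embeddingDict, average_emb, rand, dim, ML)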