def avg_embed(self, train_data, body_dict, threshold):
    """Classify headline/body pairs as related or unrelated using the
    cosine similarity of their averaged word embeddings.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: similarity cut-off between related and unrelated
    """
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Tokenize each body text once (stopwords removed), keyed by bodyID.
    body_tokens = {body_id: sent2stokens_wostop(text)
                   for body_id, text in body_dict.items()}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, body_id, stance in train_data:
        headline_tokens = sent2stokens_wostop(headline)
        sim = avg_embedding_similarity(embeddings,
                                       self.embedding_size,
                                       ' '.join(headline_tokens),
                                       ' '.join(body_tokens[body_id]))
        unrelated, related, y_true, y_pred = create_lists(
            sim, stance, threshold, [unrelated, related, y_true, y_pred])

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def doc2vec_similarity_max(self, train_data, body_dict, threshold):
    """Relate a headline to a body via its best-matching body sentence.

    Each headline is compared against every sentence of the body text;
    the highest average-embedding similarity is then checked against
    `threshold` to decide related vs. unrelated.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: similarity cut-off between related and unrelated
    """
    logging.info('Load embeddings: Vocab-Size: ' + str(self.vocab_size) +
                 ' Embedding size: ' + str(self.embedding_size))
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Put the body texts into a workable, index-addressable form.
    body_texts = list(body_dict.values())
    index_of = {body_id: pos for pos, body_id in enumerate(body_dict.keys())}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, body_id, stance in train_data:
        logging.info("Headline: " + headline)
        body_text = body_texts[index_of[body_id]]

        # Best similarity between the headline and any single sentence of
        # the body. avg_embedding_similarity tokenizes and lemmatizes its
        # inputs itself, so the raw strings are passed straight through.
        score = max((avg_embedding_similarity(embeddings,
                                              self.embedding_size,
                                              headline, sentence)
                     for sentence in text2sent(body_text)),
                    default=0)

        unrelated, related, y_true, y_pred = create_lists(
            score, stance, threshold, [unrelated, related, y_true, y_pred])

        # Manual cross-check logging only; classification happened above.
        calculated_stance = "unrelated" if score <= threshold else "related"
        logging.info(
            "Best score for this headline - sentence similarity: " + str(score))
        logging.info("Real/calculated stance: " + stance + " / " + calculated_stance)

    print_results([unrelated, related, y_true, y_pred], self.model_type)
def word_mover_distance_similarity(self, train_data, body_dict, threshold, type):
    """Classify headline/body pairs by Word Mover's Distance.

    :param train_data: list of training samples ['headline', 'bodyID', 'stance']
    :param body_dict: mapping {bodyID: 'bodyText'}
    :param threshold: distance cut-off; at or below it a pair counts as related
    :param type: 'sentence' to take the minimum distance over individual body
        sentences, 'wholeText' to compare against the full body text.
        (Name shadows the builtin, kept unchanged for caller compatibility.)

    Note: the distance is not normalized.
    """
    embeddings = LoadEmbeddings(filepath=self.embeddPath,
                                data_path=self.embeddData,
                                vocab_size=self.vocab_size,
                                embedding_size=self.embedding_size)

    # Align body texts in a workable, index-addressable format.
    bodyText_list = list(body_dict.values())
    bodyIds_index = {k: index for index, k in enumerate(body_dict.keys())}

    unrelated, related, y_true, y_pred = [], [], [], []
    for headline, bodyID, stance in train_data:
        bodyText = bodyText_list[bodyIds_index[bodyID]]
        # float('inf') instead of the arbitrary sentinel 99999, which could
        # silently mask genuinely larger distances (and stays the minimum
        # identity for min()). Also covers a body with no sentences.
        distance = float("inf")
        if type == "sentence":
            # Keep the smallest distance between the headline and any
            # single sentence of the body.
            for sentence in text2sent(bodyText):
                distance = min(distance,
                               abs(computeAverageWMD(embeddings,
                                                     headline, sentence)))
        elif type == "wholeText":
            distance = abs(computeAverageWMD(embeddings, headline, bodyText))

        unrelated, related, y_true, y_pred = create_lists_distance_based(
            distance, stance, threshold,
            [unrelated, related, y_true, y_pred])

    print_results_distance_based([unrelated, related, y_true, y_pred],
                                 self.model_type)
def load_embeddings(headlines, bodies):
    """Load the pre-trained GoogleNews word2vec embeddings.

    The `headlines` and `bodies` arguments are accepted for signature
    compatibility but are not used here.

    :return: tuple (embedding_size, embeddings)
    """
    # Fixed parameters of the GoogleNews-vectors-negative300 model.
    embedding_size = 300
    vocab_size = 3000000

    # Resolve the embedding file relative to the project root
    # (three directories above this module).
    project_root = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
    embedd_path = "%s/data/embeddings/google_news/GoogleNews-vectors-negative300.bin.gz" % (project_root)
    embedd_data = path.normpath("%s/data/" % (path.dirname(path.abspath(embedd_path))))

    embeddings = LoadEmbeddings(filepath=embedd_path,
                                data_path=embedd_data,
                                vocab_size=vocab_size,
                                embedding_size=embedding_size,
                                binary_val=True)
    return embedding_size, embeddings
# NOTE(review): the lines below are the tail of avg_feature_vector(), whose
# definition starts outside this view — indentation reconstructed, confirm
# against the full file.
            featureVec = np.add(featureVec, model.word2embedd(word))
        else:
            # Out-of-vocabulary words fall back to the "unknown" embedding.
            featureVec = np.add(featureVec, model.word2embedd(u"unknown"))
    if(nwords>0):
        # Average the summed word vectors.
        featureVec = np.divide(featureVec, nwords)
    return featureVec


def avg_embedding_similarity(embeddings, embedding_size, sent1, sent2):
    """Return the cosine similarity (1 - cosine distance) between the
    averaged embedding vectors of two sentences."""
    #print("Calculating similarity for: " + sent1 + "\n and\n" + sent2)
    v1 = avg_feature_vector(sent1, model=embeddings, num_features=embedding_size)
    v2 = avg_feature_vector(sent2, model=embeddings, num_features=embedding_size)
    cosine_distance = spatial.distance.cosine(v1, v2)
    score = 1 - cosine_distance
    #print("Score = " + str(score))
    return score


if __name__ == "__main__":
    # Manual smoke test: similarity of two paraphrases against the
    # GoogleNews word2vec vectors.
    sent1 = "United States of America"
    sent2 = "USA"

    data_path = myConstants.BASE_DIR + "/data/embeddings"
    embeddPath = os.path.normpath("%s/google_news/GoogleNews-vectors-negative300.bin.gz" % (data_path))
    embeddData = os.path.normpath("%s/google_news/data/" % (data_path))
    vocab_size = 3000000
    embedding_size = 300

    embeddings = LoadEmbeddings(filepath=embeddPath, data_path=embeddData,
                                vocab_size=vocab_size,
                                embedding_size=embedding_size)
    score = avg_embedding_similarity(embeddings, embedding_size, sent1, sent2)
    print(score)