Example #1
    def update(self, corpus):
        self.corpus = [[word.lower() for word in sent] for sent in corpus]

        ## vocabulary (built from the lowercased corpus so it matches the windows below)
        vocab = list(set(flatten(self.corpus)))
        vocab.append('<UNK>')  # Yes, we have unknown words!
        self.vocab = vocab
        word2idx = {'<UNK>': 0}

        for w in vocab:
            if word2idx.get(w) is None:
                word2idx[w] = len(word2idx)

        self.word2idx = word2idx
        self.idx2word = {v: k for k, v in word2idx.items()}

        # Pad each sentence with <DUMMY> and slide a (2 * ws + 1)-token window over it.
        windows = flatten(
            list(
                nltk.ngrams(['<DUMMY>'] * self.ws + sent +
                            ['<DUMMY>'] * self.ws, self.ws * 2 + 1)
                for sent in self.corpus))

        train_x, train_y = [], []

        for window in windows:
            for i in range(self.ws * 2 + 1):
                # Skip the center position itself and any padding tokens.
                if i == self.ws or window[i] == '<DUMMY>':
                    continue
                train_x.append(prepare_word(window[self.ws], self.word2idx))
                train_y.append(prepare_word(window[i], self.word2idx))
        self.ds = list(zip(train_x, train_y))
        return self.ds
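The snippets in this section lean on two helpers, flatten and prepare_word, that are not shown. Below is a minimal sketch of what they are assumed to do in the embedding examples (flattening tokenized sentences, and mapping a word to a one-element LongTensor index with an '<UNK>' fallback); the exact implementations in the original projects may differ.

import torch
from torch.autograd import Variable

def flatten(list_of_lists):
    # Collapse a list of tokenized sentences into one flat list of tokens.
    return [token for sentence in list_of_lists for token in sentence]

def prepare_word(word, word2index):
    # Look up the word's vocabulary index, falling back to '<UNK>' for
    # out-of-vocabulary words, and wrap it as a one-element LongTensor.
    idx = word2index.get(word, word2index['<UNK>'])
    return Variable(torch.LongTensor([idx]))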
Example #2
    def word_similarity(self,
                        target: str,
                        vocab: list,
                        word2index: dict,
                        top_rank: int = 10):
        # Embed the target word once, then compare it against every other
        # vocabulary entry by cosine similarity.
        target_V = self.model.prediction(prepare_word(target, word2index))
        similarities = []
        for i in range(len(vocab)):
            if vocab[i] == target:
                continue

            vector = self.model.prediction(prepare_word(vocab[i], word2index))
            cosine_sim = F.cosine_similarity(target_V,
                                             vector).data.tolist()[0]
            similarities.append([vocab[i], cosine_sim])
        return sorted(similarities, key=lambda x: x[1],
                      reverse=True)[:top_rank]
Example #3
    def prepare_train_data_method(self,
                                  window_data: List = [],
                                  word2index: dict = {},
                                  weighting_dic: dict = {},
                                  X_ik: dict = {}):
        u_p = []       # center-word index tensors
        v_p = []       # context-word index tensors
        co_p = []      # log co-occurrence counts
        weight_p = []  # GloVe weighting-function values f(X_ij)
        # Reference
        # view
        #    http://pytorch.org/docs/master/tensors.html#torch.Tensor.view
        for pair in window_data:
            u_p.append(prepare_word(pair[0], word2index).view(1, -1))
            v_p.append(prepare_word(pair[1], word2index).view(1, -1))
            try:
                cooc = X_ik[pair]
            except KeyError:
                cooc = 1

            co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1, -1))
            weight_p.append(
                Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))

        train_data = list(zip(u_p, v_p, co_p, weight_p))
        return train_data
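Example #3 packs each training pair into a (center index, context index, log co-occurrence, weight) tuple, which is exactly what the GloVe weighted least-squares objective consumes. Below is a hedged sketch of a model that could take such a batch; the class name, embedding size, and the forward interface are illustrative assumptions, not taken from the original project.

import torch
import torch.nn as nn

class ToyGloVe(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.u_embed = nn.Embedding(vocab_size, dim)   # center-word vectors
        self.v_embed = nn.Embedding(vocab_size, dim)   # context-word vectors
        self.u_bias = nn.Embedding(vocab_size, 1)
        self.v_bias = nn.Embedding(vocab_size, 1)

    def forward(self, u_idx, v_idx, log_cooc, weight):
        # u_idx, v_idx: (batch, 1) LongTensors; log_cooc, weight: (batch, 1) floats.
        dot = (self.u_embed(u_idx) * self.v_embed(v_idx)).sum(dim=2)
        pred = dot + self.u_bias(u_idx).squeeze(2) + self.v_bias(v_idx).squeeze(2)
        # GloVe objective: weight * (u.v + b_u + b_v - log X_ij)^2, summed over the batch.
        return (weight * (pred - log_cooc) ** 2).sum()

# Stacking the tuples returned by prepare_train_data_method into batch tensors:
#   u_idx, v_idx, log_cooc, weight = map(torch.cat, zip(*train_data))
#   loss = ToyGloVe(len(word2index), 50)(u_idx, v_idx, log_cooc, weight)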
Example #4
def most_similiar(model, word, vocab, word2idx, topk):
    vec = model.predict(prepare_word(word, word2idx))

    similarity = []

    for i in range(len(vocab)):
        if vocab[i] == word:
            continue

        vec_test = model.predict(prepare_word(vocab[i], word2idx))

        # cosine_similarity returns a one-element tensor; .item() extracts the scalar.
        cos_distance = F.cosine_similarity(vec, vec_test).item()
        # Keep the word alongside its score so the sort key below works.
        similarity.append([vocab[i], cos_distance])

    return sorted(similarity, key=lambda x: x[1], reverse=True)[:topk]
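A hedged usage sketch for the function above, assuming a trained model exposing predict and the vocab/word2idx structures built as in Example #1; the query word is only an illustration.

# Top 5 nearest neighbours of 'king' under the trained embeddings.
for word, score in most_similiar(model, 'king', vocab, word2idx, topk=5):
    print(word, round(score, 3))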
Example #5
    def get_all_words(self):
        words = defaultdict(lambda: 0)
        print('delimiters: {}'.format(TITLE_DELIMITER))
        videos = self.db_handler.get_all_videos()
        for video in videos:
            for word in extract_words(video.title):
                words[prepare_word(word)] += 1

        return words
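The video-title examples (get_all_words above and fill_words_for_videos further below) call a one-argument prepare_word(word) alongside extract_words, which suggests plain text normalization rather than the tensor helper sketched earlier. A hedged sketch of what these might look like; TITLE_DELIMITER and the normalization rules are assumptions.

import re

TITLE_DELIMITER = ' '  # assumed delimiter used to split video titles

def extract_words(title):
    # Split a video title on the delimiter and drop empty fragments.
    return [part for part in title.split(TITLE_DELIMITER) if part]

def prepare_word(word):
    # Normalize a raw title token: lowercase it and strip punctuation.
    return re.sub(r'[^\w]', '', word.lower())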
Example #6
    def classify_predict(self, word: str, classify_model_name: str, word2index: dict):
        if word not in word2index:
            return self.other_classes
        vector = self.model.prediction(prepare_word(word,
                                                    word2index))
        vector = self.__transfer_vector(vector)
        classifier = joblib.load(classify_model_name)
        classes = classifier.predict(vector)
        return classes[0]
Example #7
    def fill_words_for_videos(self):
        words = self.db_handler.db_session.query(Word).all()
        word_dict = {}
        for word in words:
            word_dict[word.word] = word

        videos = self.db_handler.get_all_videos()
        for video in videos:
            wordids = set()
            for word in extract_words(video.title):
                w = prepare_word(word)
                if w in word_dict:
                    wordids.add(word_dict[w].id)
            video.wordids = serialize_ids(wordids)

        self.db_handler.commit()