def update(self, corpus):
    """Rebuild the vocabulary, index maps and skip-gram training pairs.

    Lower-cases *corpus*, derives the vocabulary (plus an '<UNK>' token),
    builds word<->index maps, then slides a window of +/- self.ws words
    over every sentence to produce (center, context) training pairs.

    Returns the list of (center_tensor, context_tensor) pairs, which is
    also stored on self.ds.
    """
    self.corpus = [[word.lower() for word in sent] for sent in corpus]

    # Build the vocabulary from the LOWER-CASED corpus. Building it from
    # the raw `corpus` (as before) left capitalized tokens in the vocab
    # while the training text was lower-cased, so every word that was
    # capitalized in the input always resolved to '<UNK>'.
    vocab = list(set(flatten(self.corpus)))
    vocab.append('<UNK>')  # yes, we have unknown words!
    self.vocab = vocab

    word2idx = {'<UNK>': 0}
    for w in vocab:
        if word2idx.get(w) is None:
            word2idx[w] = len(word2idx)
    self.word2idx = word2idx
    self.idx2word = {v: k for k, v in word2idx.items()}

    # Pad each sentence with '<DUMMY>' so every real word can serve as a
    # window center.
    windows_word_pair = flatten(
        list(
            nltk.ngrams(['<DUMMY>'] * self.ws + center_word +
                        ['<DUMMY>'] * self.ws, self.ws * 2 + 1)
            for center_word in self.corpus))

    train_x, train_y = [], []
    for data in windows_word_pair:
        for w in range(self.ws * 2 + 1):
            # Skip the center position itself and any padding token.
            # The original compared the int index `w` to the center WORD
            # (`data[self.ws]`), which is never equal, so (center, center)
            # self-pairs leaked into the training data.
            if w == self.ws or data[w] == '<DUMMY>':
                continue
            train_x.append(prepare_word(data[self.ws], self.word2idx))
            train_y.append(prepare_word(data[w], self.word2idx))

    ds = [(x, y) for x, y in zip(train_x, train_y)]
    self.ds = ds
    return self.ds
def word_similarity(self, target: list, vocab: list, word2index: dict,
                    top_rank: int = 10):
    """Return the `top_rank` vocabulary entries most similar to `target`.

    Computes cosine similarity between the model's embedding for
    `target` and every other word in `vocab`, and returns a list of
    [word, similarity] pairs sorted by descending similarity.
    """
    # The original duplicated the exact same prediction call in
    # identical `if USE_CUDA` / `else` branches (twice); prepare_word
    # handles device placement, so a single call suffices.
    target_V = self.model.prediction(prepare_word(target, word2index))

    similarities = []
    for word in vocab:
        if word == target:
            continue
        vector = self.model.prediction(prepare_word(word, word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([word, cosine_sim])

    return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_rank]
def prepare_train_data_method(self, window_data: List = None,
                              word2index: dict = None,
                              weighting_dic: dict = None,
                              X_ik: dict = None):
    """Build GloVe-style training tuples from co-occurrence window data.

    For each (word_i, word_j) pair in `window_data`, produces a tuple
    (u, v, log_cooccurrence, weight) of 1 x d row tensors (see
    http://pytorch.org/docs/master/tensors.html#torch.Tensor.view).

    Returns the list of those 4-tuples.
    """
    # Mutable default arguments ([], {}) are shared across all calls of
    # the method; None sentinels avoid that pitfall while keeping the
    # same effective defaults.
    window_data = [] if window_data is None else window_data
    word2index = {} if word2index is None else word2index
    weighting_dic = {} if weighting_dic is None else weighting_dic
    X_ik = {} if X_ik is None else X_ik

    u_p, v_p, co_p, weight_p = [], [], [], []
    for pair in window_data:
        u_p.append(prepare_word(pair[0], word2index).view(1, -1))
        v_p.append(prepare_word(pair[1], word2index).view(1, -1))
        # Pairs absent from the co-occurrence table default to 1 so
        # log() stays finite; the bare `except:` previously swallowed
        # every exception, not just the missing-key case.
        cooc = X_ik.get(pair, 1)
        co_p.append(torch.log(Variable(FloatTensor([cooc]))).view(1, -1))
        weight_p.append(
            Variable(FloatTensor([weighting_dic[pair]])).view(1, -1))

    train_data = list(zip(u_p, v_p, co_p, weight_p))
    return train_data
def most_similiar(model, word, vocab, word2idx, topk):
    """Return the `topk` [word, similarity] pairs most similar to `word`.

    Compares the embedding of `word` against every other entry in
    `vocab` with cosine similarity, sorted by descending similarity.
    """
    vec = model.predict(prepare_word(word, word2idx))
    similarity = []
    for candidate in vocab:
        if candidate == word:
            continue
        vec_test = model.predict(prepare_word(candidate, word2idx))
        # .item() already returns a plain Python float; the original
        # `.item()[0]` raised TypeError.
        cos_distance = F.cosine_similarity(vec, vec_test).item()
        # Keep the word alongside its score: the sort key below reads
        # x[1], which crashed when bare floats were appended.
        similarity.append([candidate, cos_distance])
    return sorted(similarity, key=lambda x: x[1], reverse=True)[:topk]
def get_all_words(self):
    """Count occurrences of every prepared word across all video titles.

    Returns a defaultdict mapping prepared word -> occurrence count.
    """
    words = defaultdict(lambda: 0)
    # Python 2 print statement converted to the print() function: the
    # rest of this file uses Python 3 syntax (type annotations), so the
    # old form was a SyntaxError.
    print('delimiters: {}'.format(TITLE_DELIMITER))
    videos = self.db_handler.get_all_videos()
    for video in videos:
        for word in extract_words(video.title):
            words[prepare_word(word)] += 1
    return words
def classify_predict(self, word: str, classify_model_name: str,
                     word2index: dict):
    """Predict the class of `word` with a persisted sklearn classifier.

    Looks up the word's embedding via the model, transforms it into the
    classifier's feature space, loads the classifier from
    `classify_model_name` and returns the predicted class label.
    Out-of-vocabulary words fall back to `self.other_classes`.
    """
    # NOTE: the annotation was `word2index: list`, but the value is used
    # as a word -> index mapping (`word not in word2index`, passed to
    # prepare_word), so `dict` is the correct type.
    if word not in word2index:
        return self.other_classes
    vector = self.model.prediction(prepare_word(word, word2index))
    vector = self.__transfer_vector(vector)
    classifier = joblib.load(classify_model_name)
    classes = classifier.predict(vector)
    return classes[0]
def fill_words_for_videos(self):
    """Attach known word ids to every video based on its title words.

    Loads all Word rows, matches each video title's prepared words
    against them, serializes the matched ids onto the video, and
    commits the session once at the end.
    """
    known = {entry.word: entry
             for entry in self.db_handler.db_session.query(Word).all()}
    for video in self.db_handler.get_all_videos():
        matched_ids = set()
        for raw in extract_words(video.title):
            prepared = prepare_word(raw)
            if prepared in known:
                matched_ids.add(known[prepared].id)
        video.wordids = serialize_ids(matched_ids)
    self.db_handler.commit()