def get_prediction(self, sentence1, sentence2):
    # Tokenize and pad both sentences, stripping non-ASCII characters first.
    sent1 = gutils.padd_fn(
        gutils.get_tokens(
            sentence1.encode("ascii", errors="ignore").decode()))
    sent2 = gutils.padd_fn(
        gutils.get_tokens(
            sentence2.encode("ascii", errors="ignore").decode()))

    # Look up word vectors for each padded token sequence and return the
    # similarity score predicted by the siamese model.
    return self.model.predict([
        gutils.get_wv_siamese(self.wv_model, [sent1]),
        gutils.get_wv_siamese(self.wv_model, [sent2])
    ])[0][0]
def get_representations(self, sentences):
    # Tokenize and pad every sentence, stripping non-ASCII characters first.
    sentences = [
        gutils.padd_fn(
            gutils.get_tokens(
                sentence.encode("ascii", errors="ignore").decode()))
        for sentence in sentences
    ]

    # Return one embedding per input sentence from the siamese model.
    return self.model.get_embeddings(
        gutils.get_wv_siamese(self.wv_model, sentences))
def create_sent_tokens_array():
    # Relies on a module-level `items` list of catalog items.
    tokens_file, sents_arr_file = None, None
    try:
        # EArray of padded string tokens, one row of cnt.MAX_WORDS per item.
        tokens_file = tables.open_file(
            os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(
            tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        vocab = set()
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n) / batch_size))

        # Tokenize the items in batches, appending each batch to the EArray
        # and collecting the vocabulary on the fly.
        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            batch_items = [items[x] for x in range(start, end)]
            tokens = [
                gutils.padd_fn(gutils.get_tokens(gutils.get_item_text(item)))
                for item in batch_items
            ]
            tokens_arr.append(tokens)
            vocab.update([x for token in tokens for x in token])

        # Build and persist the word-to-index map (index 0 is left for padding).
        vocab = sorted(list(vocab))
        word2idx_map = {w: i + 1 for i, w in enumerate(vocab)}
        gutils.save_data_pkl(word2idx_map, cnt.WORD2IDX_FILE)

        sent_tokens = tokens_file.root.data

        # EArray of integer word indices, one row of cnt.MAX_WORDS per item.
        sents_arr_file = tables.open_file(
            os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='w')
        atom = tables.Int32Atom()
        sents_arr = sents_arr_file.create_earray(
            sents_arr_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        # Convert the stored tokens to word indices in batches.
        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            tokens = [sent_tokens[x] for x in range(start, end)]
            sent_arrs = [
                [gutils.word_to_idx(w, word2idx_map) for w in token]
                for token in tokens
            ]
            sents_arr.append(sent_arrs)
    finally:
        # Close only the files that were actually opened.
        if tokens_file is not None:
            tokens_file.close()
        if sents_arr_file is not None:
            sents_arr_file.close()
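# A minimal sketch of reading back the index array written above, assuming the
# same cnt constants; `load_sent_arrays` is a hypothetical helper, not part of
# the original module.
def load_sent_arrays():
    arr_file = tables.open_file(
        os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='r')
    try:
        # Each row is a padded sequence of cnt.MAX_WORDS word indices.
        return arr_file.root.data[:]
    finally:
        arr_file.close()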
def get_representation(self, sentence):
    # Tokenize and pad the sentence, stripping non-ASCII characters first.
    sent = gutils.padd_fn(
        gutils.get_tokens(
            sentence.encode("ascii", errors="ignore").decode()))

    # Return the embedding of the single sentence from the siamese model.
    return self.model.get_embeddings(
        gutils.get_wv_siamese(self.wv_model, [sent]))[0]
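# A minimal usage sketch, assuming the methods above belong to a trained
# siamese wrapper class (hypothetically named SiameseNet here) that exposes a
# `model` and a `wv_model`; the class name and sentences are illustrative only.
#
#     siamese = SiameseNet()
#     score = siamese.get_prediction("red cotton shirt", "cotton shirt in red")
#     emb = siamese.get_representation("red cotton shirt")
#     embs = siamese.get_representations(["red cotton shirt", "blue denim jeans"])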