Example #1
    def get_prediction(self, sentence1, sentence2):
        # Strip non-ASCII characters, tokenize, and pad both sentences to
        # a fixed length before looking up word vectors.
        sent1 = gutils.padd_fn(
            gutils.get_tokens(
                sentence1.encode("ascii", errors="ignore").decode()))
        sent2 = gutils.padd_fn(
            gutils.get_tokens(
                sentence2.encode("ascii", errors="ignore").decode()))

        # The siamese model takes the two word-vector sequences as separate
        # inputs and returns one similarity score per pair.
        return self.model.predict([
            gutils.get_wv_siamese(self.wv_model, [sent1]),
            gutils.get_wv_siamese(self.wv_model, [sent2])
        ])[0][0]
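
A minimal usage sketch, assuming the method above lives on a wrapper class holding a trained siamese model and a word-vector model; the SiameseScorer name and its constructor arguments are hypothetical, since the example does not show the class definition:

# Hypothetical wrapper; name and constructor arguments are placeholders.
scorer = SiameseScorer(model=trained_siamese_model, wv_model=word2vec_model)

# predict() returns a (1, 1) array; [0][0] unwraps the scalar similarity.
score = scorer.get_prediction("wireless bluetooth headphones",
                              "bluetooth headset, wireless")
print(f"similarity = {score:.3f}")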
Example #2
    def get_representations(self, sentences):
        # Normalize every sentence the same way as at training time:
        # drop non-ASCII characters, tokenize, and pad to a fixed length.
        sentences = [
            gutils.padd_fn(
                gutils.get_tokens(
                    sentence.encode("ascii", errors="ignore").decode()))
            for sentence in sentences
        ]
        # Embed the whole batch in one call to the shared encoder branch.
        return self.model.get_embeddings(
            gutils.get_wv_siamese(self.wv_model, sentences))
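
A minimal sketch of the batch path; as above, the wrapper instance (here encoder) is hypothetical, and get_embeddings is assumed to return one fixed-size vector per input sentence, consistent with the [0] indexing in Example #4:

# Hypothetical instance of the class defining get_representations.
embeddings = encoder.get_representations([
    "wireless bluetooth headphones",
    "noise cancelling headset",
])
print(len(embeddings))  # one embedding per sentence -> 2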
Example #3
import math
import os

import tables

def create_sent_tokens_array():
    # `items` is assumed to be available in the enclosing scope (the full
    # list of catalog items); gutils and cnt are project-local modules.
    tokens_file, sents_arr_file = None, None
    try:
        # Pass 1: tokenize every item's text into a fixed-width string
        # EArray on disk and collect the vocabulary.
        tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        vocab = set()

        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
        num_batches = int(math.ceil(float(n) / batch_size))

        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            batch_items = [items[x] for x in range(start, end)]
            tokens = [gutils.padd_fn(gutils.get_tokens(gutils.get_item_text(item))) for item in batch_items]
            tokens_arr.append(tokens)
            vocab.update([x for token in tokens for x in token])

        # Index words from 1 so that 0 stays free as the padding index.
        vocab = sorted(list(vocab))
        word2idx_map = {w: i + 1 for i, w in enumerate(vocab)}
        gutils.save_data_pkl(word2idx_map, cnt.WORD2IDX_FILE)

        sent_tokens = tokens_file.root.data

        # Pass 2: map the stored tokens to integer indices in a second
        # EArray, again in batches to bound memory use.
        sents_arr_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='w')
        atom = tables.Int32Atom()
        sents_arr = sents_arr_file.create_earray(sents_arr_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        for m in range(num_batches):
            start, end = m * batch_size, min((m + 1) * batch_size, n)
            # Note: PyTables returns fixed-width strings as bytes under
            # Python 3; word_to_idx is assumed to handle that encoding.
            tokens = [sent_tokens[x] for x in range(start, end)]
            sent_arrs = [[gutils.word_to_idx(w, word2idx_map) for w in token] for token in tokens]
            sents_arr.append(sent_arrs)

    finally:
        # Close only the files that were actually opened; the original code
        # raised NameError here if the first open_file call failed.
        if tokens_file is not None:
            tokens_file.close()
        if sents_arr_file is not None:
            sents_arr_file.close()
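
A minimal sketch of reading the generated index matrix back, assuming the same cnt constants as above; the file and node names match those created by create_sent_tokens_array:

import os

import tables

with tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_ARRAYS_FILE), mode='r') as f:
    # f.root.data is an on-disk (num_items, cnt.MAX_WORDS) int32 matrix;
    # slicing reads only the requested rows into memory.
    first_batch = f.root.data[:32]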
Example #4
    def get_representation(self, sentence):
        # Same preprocessing as the pairwise path, applied to one sentence.
        sent = gutils.padd_fn(
            gutils.get_tokens(
                sentence.encode("ascii", errors="ignore").decode()))
        # Embed a batch of one and return its single embedding vector.
        return self.model.get_embeddings(
            gutils.get_wv_siamese(self.wv_model, [sent]))[0]
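
A minimal sketch comparing two single-sentence embeddings with cosine similarity; the encoder variable stands for an instance of the (unnamed) class that owns get_representation and is hypothetical:

import numpy as np

# Hypothetical instance of the class defining get_representation.
emb1 = encoder.get_representation("red cotton t-shirt")
emb2 = encoder.get_representation("crimson tee shirt")

# Cosine similarity between the two fixed-size embedding vectors.
cos = float(np.dot(emb1, emb2) /
            (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
print(f"cosine similarity = {cos:.3f}")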