Example #1
 def __init__(self):
     # Restore the trained matching model and its inference session.
     self.infer_model, self.infer_sess = self._load_pretrained_model()
     # Load the fitted TF-IDF vectorizers (character- and word-level).
     with open("../dump/tfidf_char_vectorizer.pkl", "rb") as f:
         self.tfidf_char_vectorizer = pickle.load(f)
     with open("../dump/tfidf_word_vectorizer_big.pkl", "rb") as f:
         self.tfidf_word_vectorizer = pickle.load(f)
     # Jamo-level processor and SentencePiece tokenizer for Korean text.
     self.processor = JamoProcessor()
     self.tokenizer = SentencePieceTokenizer(config)
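A minimal usage sketch (not part of the original snippet), assuming the pickled objects are scikit-learn TfidfVectorizer instances fitted with custom tokenizers like the ones in Example #6, and that query is a raw Korean string; it would live inside the same class:

 def featurize(self, query):
     # Hypothetical helper: TF-IDF features plus SentencePiece tokens for one query.
     word_features = self.tfidf_word_vectorizer.transform([query])
     char_features = self.tfidf_char_vectorizer.transform([query])
     tokens = self.tokenizer.tokenize(query)
     return word_features, char_features, tokens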
Example #2
 def _build_vocab(self):
     # Count SentencePiece tokens over the preprocessed (corpus_id, query, reply) corpus.
     count = Counter()
     processor = JamoProcessor()
     self.fasttext = FastText.load(self.pretrained_embed_dir)
     fname = os.listdir(self.base_dir)[0]
     with open(
             "/media/scatter/scatterdisk/reply_matching_model/sol.preprocessed_1.txt",
             "r") as f:
         for line in f:
             corpus_id, query, reply = line.strip().split("\t")
             count.update(self.tokenizer.tokenize(query))
             count.update(self.tokenizer.tokenize(reply))
     # Reserve the first three ids for special tokens, then keep the most frequent words.
     idx2word = [self.UNK_TOKEN, self.SOS_TOKEN, self.EOS_TOKEN] + \
                sorted([word for word, _ in count.most_common(self.vocab_size - 3)])
     word2idx = {word: idx for idx, word in enumerate(idx2word)}
     return word2idx, idx2word
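A short sketch (not in the original) of how the returned mappings could encode a sentence, with unseen words falling back to UNK_TOKEN; the encode helper and the sentence argument are hypothetical:

 def encode(self, sentence):
     # Hypothetical helper built on the output of _build_vocab.
     word2idx, idx2word = self._build_vocab()
     unk_id = word2idx[self.UNK_TOKEN]
     return [word2idx.get(token, unk_id) for token in self.tokenizer.tokenize(sentence)]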
Example #3
def get_embeddings(idx2word, config):
    # Start from a random uniform matrix; rows are overwritten below whenever a
    # pre-trained vector is available for the word.
    embedding = np.random.uniform(-1 / 16, 1 / 16,
                                  [config.vocab_size, config.embed_dim])
    if config.pretrained_embed_dir:
        processor = JamoProcessor()
        ft = FastText.load(config.pretrained_embed_dir)
        num_oov = 0
        for i, vocab in enumerate(idx2word):
            try:
                # Look up the jamo-decomposed word in the pre-trained FastText model.
                embedding[i, :] = ft.wv[processor.word_to_jamo(vocab)]
            except KeyError:
                num_oov += 1
        print("Pre-trained embedding loaded. Number of OOV: {} / {}".format(
            num_oov, len(idx2word)))
    else:
        print("No pre-trained embedding found, initializing with random uniform distribution")
    return embedding
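A hedged usage sketch; the Config namedtuple, its field values, and the model path are hypothetical stand-ins for the project's real configuration object, and idx2word is assumed to come from a vocabulary builder like the one in Example #2:

from collections import namedtuple

Config = namedtuple("Config", ["vocab_size", "embed_dim", "pretrained_embed_dir"])
config = Config(vocab_size=100000, embed_dim=256,
                pretrained_embed_dir="/path/to/fasttext_jamo.model")  # hypothetical path
embedding_matrix = get_embeddings(idx2word, config)  # numpy array of shape [vocab_size, embed_dim]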
Example #4
def get_embeddings(vocab_list_dir,
                   pretrained_embed_dir,
                   vocab_size,
                   embed_dim):
    # Random uniform initialization; rows are overwritten with FastText vectors
    # when both the vocab list and the pre-trained model files exist.
    embedding = np.random.uniform(-1/16, 1/16, [vocab_size, embed_dim])
    if os.path.isfile(pretrained_embed_dir) and os.path.isfile(vocab_list_dir):
        with open(vocab_list_dir, "r", encoding="utf-8") as f:
            vocab_list = [word.strip() for word in f if word.strip()]
        processor = JamoProcessor()
        ft = FastText.load(pretrained_embed_dir)
        num_oov = 0
        for i, vocab in enumerate(vocab_list):
            try:
                embedding[i, :] = ft.wv[processor.word_to_jamo(vocab)]
            except KeyError:
                num_oov += 1
        print("Pre-trained embedding loaded. Number of OOV: {} / {}".format(num_oov, len(vocab_list)))
    else:
        print("No pre-trained embedding found, initializing with random uniform distribution")
    return embedding
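Since the surrounding project (korean-text-matching-tf, imported in Example #6) targets TensorFlow 1.x, one plausible way to consume the returned matrix is as the initializer of an embedding variable; the file paths and sizes below are hypothetical:

import numpy as np
import tensorflow as tf

embedding = get_embeddings("vocab_list.txt", "fasttext_jamo.model", 100000, 256)
embed_var = tf.get_variable("embedding",
                            initializer=embedding.astype(np.float32))
token_ids = tf.placeholder(tf.int32, [None, None])  # [batch, sequence_length]
embedded_tokens = tf.nn.embedding_lookup(embed_var, token_ids)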
Example #5
 def __init__(self):
     self.tfidf_vectorizer = None
     self.jamo_processor = JamoProcessor()
Example #6
import sys
from collections import namedtuple

sys.path.append("/home/angrypark/korean-text-matching-tf")

from data_loader import DataGenerator
from trainer import MatchingModelTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor
from text.tokenizers import SentencePieceTokenizer

Config = namedtuple("config", ["sent_piece_model"])
config = Config("/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model")
processor = JamoProcessor()
tokenizer = SentencePieceTokenizer(config)


def my_word_tokenizer(raw,
                      pos=["Noun", "Alpha", "Verb", "Number"],
                      stopword=[]):
    # Word-level tokens straight from the shared SentencePiece tokenizer.
    return [word for word in tokenizer.tokenize(raw)]


def my_char_tokenizer(raw,
                      pos=["Noun", "Alpha", "Verb", "Number"],
                      stopword=[]):
    # Jamo-decomposed tokens, giving character-level features for Korean text.
    return [processor.word_to_jamo(word) for word in tokenizer.tokenize(raw)]
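These tokenizer wrappers look intended for scikit-learn's TfidfVectorizer (matching the tfidf_word_vectorizer / tfidf_char_vectorizer attributes in Example #1); a minimal sketch under that assumption, where corpus is a hypothetical iterable of raw reply strings:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_word_vectorizer = TfidfVectorizer(tokenizer=my_word_tokenizer, lowercase=False)
tfidf_char_vectorizer = TfidfVectorizer(tokenizer=my_char_tokenizer, lowercase=False)
tfidf_word_vectorizer.fit(corpus)
tfidf_char_vectorizer.fit(corpus)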