Example #1
import os
import shutil
import unittest

from medcat.vocab import Vocab

class CATTests(unittest.TestCase):
    def setUp(self) -> None:
        self.undertest = Vocab()
        self.tmp_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "tmp")
        os.makedirs(self.tmp_dir, exist_ok=True)

    def tearDown(self) -> None:
        shutil.rmtree(self.tmp_dir)

    def test_add_words(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.assertEqual(["house", "dog"], list(self.undertest.vocab.keys()))

    def test_add_word(self):
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        self.assertEqual(["test"], list(self.undertest.vocab.keys()))
        self.assertTrue("test" in self.undertest)

    def test_count(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.assertEqual(34444, self.undertest.count("house"))

    def test_save_and_load(self):
        self.undertest.add_words(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "examples", "vocab_data.txt"))
        self.undertest.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55])
        vocab_path = f"{self.tmp_dir}/vocab.dat"
        self.undertest.save(vocab_path)
        vocab = Vocab.load(vocab_path)
        self.assertEqual(["house", "dog", "test"], list(vocab.vocab.keys()))
Example #2
import logging
from pathlib import Path

from gensim.models import Word2Vec

from medcat.pipe import Pipe
from medcat.preprocessing.iterators import SimpleIter
from medcat.preprocessing.taggers import tag_skip_and_punct
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.vocab import Vocab

class MakeVocab(object):
    r'''
    Create a new vocab from a text file.

    Args:
        config (medcat.config.Config):
            Global configuration, used to build the spaCy pipeline.
        cdb (medcat.cdb.CDB):
            The concept database whose words will be added on top of the Vocab built from the text file.
        vocab (medcat.vocab.Vocab, optional):
            Vocabulary to be extended, leave as None if you want to make a new Vocab. Default: None
        word_tokenizer (callable, optional):
            A custom tokenizer for word splitting - used if embeddings are BERT or similar.
            Default: None

    Examples:
        To make a vocab and train word embeddings:

        >>> cdb = <your existing cdb>
        >>> maker = MakeVocab(cdb=cdb, config=config)
        >>> maker.make(data_iterator, out_folder="./output/")
        >>> maker.add_vectors(in_path="./output/data.txt")
    '''
    log = logging.getLogger(__name__)

    def __init__(self, config, cdb=None, vocab=None, word_tokenizer=None):
        self.cdb = cdb
        self.config = config
        self.w2v = None
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocab()

        # Build the required spacy pipeline
        self.nlp = Pipe(tokenizer=spacy_split_all, config=config)
        self.nlp.add_tagger(tagger=tag_skip_and_punct,
                            name='skip_and_punct',
                            additional_fields=['is_punct'])

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

        # Used for saving if the real path is not set
        self.vocab_path = "./tmp_vocab.dat"

    def _tok(self, text):
        # Fallback word tokenizer: treat the whole text as a single token.
        return [text]

    def make(self,
             iter_data,
             out_folder,
             join_cdb=True,
             normalize_tokens=False):
        r'''
        Make a vocab - without vectors initially. This will create two files in the out_folder:
        - vocab.dat -> The vocabulary without vectors
        - data.txt -> The tokenized dataset prepared for training of word2vec or similar embeddings.

        Args:
            iter_data (Iterator):
                An iterator over sentences or documents. Can also be a simple array of text documents/sentences.
            out_folder (string):
                A path to a folder where all the results will be saved
            join_cdb (bool):
                Should the words from the CDB be added to the Vocab. Default: True
            normalize_tokens (bool, defaults to False):
                If set, tokens will be lemmatized - this tends to work better in cases where the difference
                between e.g. plural/singular should be ignored, but it matters less if the dataset is big enough.
        '''
        # Save the preprocessed data, used for emb training
        out_path = Path(out_folder) / "data.txt"
        vocab_path = Path(out_folder) / "vocab.dat"
        self.vocab_path = vocab_path

        with open(out_path, 'w', encoding='utf-8') as out:
            for ind, doc in enumerate(iter_data):
                if ind % 10000 == 0:
                    self.log.info("Vocab builder at: %s", ind)

                # Only the spaCy tokenizer is needed here, not the full pipeline
                doc = self.nlp.nlp.tokenizer(doc)
                line = ""

                for token in doc:
                    if token.is_space or token.is_punct:
                        continue

                    if len(token.lower_) > 0:
                        if normalize_tokens:
                            self.vocab.inc_or_add(token._.norm)
                        else:
                            self.vocab.inc_or_add(token.lower_)

                    # Join multi-word tokens with underscores so each vocab
                    # entry is a single whitespace-free token in data.txt
                    if normalize_tokens:
                        line = line + " " + "_".join(token._.norm.split(" "))
                    else:
                        line = line + " " + "_".join(token.lower_.split(" "))

                out.write(line.strip())
                out.write("\n")

        if join_cdb and self.cdb:
            for word in self.cdb.vocab.keys():
                if word not in self.vocab:
                    self.vocab.add_word(word)
                else:
                    # Update the count with the counts from the new dataset
                    self.cdb.vocab[word] += self.vocab[word]

        # Save the vocab also
        self.vocab.save(path=self.vocab_path)

    def add_vectors(self,
                    in_path=None,
                    w2v=None,
                    overwrite=False,
                    data_iter=None,
                    workers=14,
                    niter=2,
                    min_count=10,
                    window=10,
                    vsize=300,
                    unigram_table_size=100000000):
        r'''
        Add vectors to an existing vocabulary and save changes to the vocab_path.

        Args:
            in_path (str):
                Path to the data.txt that was created by the MakeVocab.make() function.
            w2v (Word2Vec, optional):
                An existing word2vec instance. Default: None
            overwrite (bool):
                If True, existing vectors in the vocabulary will be overwritten. Default: False
            data_iter (iterator):
                A custom iterator over the data, if you want to provide one; when given, in_path is not needed.
            **: Remaining arguments are passed to Word2Vec (gensim < 4.0 parameter names).

        Returns:
            A trained word2vec model.
        '''
        if w2v is None:
            if data_iter is None:
                data = SimpleIter(in_path)
            else:
                data = data_iter
            # gensim < 4.0 API: in gensim >= 4.0 these are vector_size= and epochs=
            w2v = Word2Vec(data,
                           window=window,
                           min_count=min_count,
                           workers=workers,
                           size=vsize,
                           iter=niter)

        # w2v.wv.vocab is the gensim < 4.0 keyed-vectors vocabulary
        for word in w2v.wv.vocab.keys():
            if word in self.vocab:
                # Only write a vector if overwriting or none exists yet
                if overwrite or self.vocab.vec(word) is None:
                    self.vocab.add_vec(word, w2v.wv.get_vector(word))

        # Save the vocab again, now with vectors
        self.vocab.make_unigram_table(table_size=unigram_table_size)
        self.vocab.save(path=self.vocab_path)
        return w2v

    def destroy_pipe(self):
        # Release the spaCy pipeline once vocab building is done
        self.nlp.destroy()
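
Putting the two steps together, a minimal end-to-end sketch might look like the following. The CDB path, the Config defaults, and the two-document corpus are hypothetical placeholders; any iterable of raw text documents works as input.

from medcat.cdb import CDB
from medcat.config import Config

config = Config()
cdb = CDB.load("my_cdb.dat")  # hypothetical path to an existing concept database

maker = MakeVocab(config=config, cdb=cdb)
# Build vocab.dat and data.txt in ./output/
maker.make(["The dog sat in the house.", "A dog and a house."],
           out_folder="./output/")
# Train word2vec on data.txt and attach the vectors to the vocab;
# min_count is lowered here only because the toy corpus is tiny
w2v = maker.add_vectors(in_path="./output/data.txt", min_count=1)
maker.destroy_pipe()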