Example #1
    def clear_embedding_list(self, model, embedding_word_dict, words_dict):
        cleared_embedding_list = []
        cleared_embedding_word_dict = {}
        k, l = 0, 0  # k: OOV words whose vector could not be synthesized, l: words missing from the embedding
        if self.cfg.do_spellcheck_oov_words:
            def P(word):
                # Rank candidates by (negated) position in the embedding vocabulary,
                # so max() prefers words that appear earlier (typically more frequent ones).
                return -embedding_word_dict.get(word, 0)

            def correction(word):
                return max(candidates(word), key=P)

            def candidates(word):
                return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

            def known(words):
                "The subset of `words` that appear in the embedding vocabulary."
                return set(w for w in words if w in embedding_word_dict)

            def edits1(word):
                "All edits that are one edit away from `word`."
                letters = 'abcdefghijklmnopqrstuvwxyz'
                splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
                deletes = [L + R[1:] for L, R in splits if R]
                transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
                replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
                inserts = [L + c + R for L, R in splits for c in letters]
                return set(deletes + transposes + replaces + inserts)

            def edits2(word):
                "All edits that are two edits away from `word`."
                return (e2 for e1 in edits1(word) for e2 in edits1(e1))

        for word in tqdm.tqdm(words_dict):
            if word not in embedding_word_dict:
                l += 1
                if self.cfg.do_spellcheck_oov_words:
                    # First try to map the OOV word onto a spell-corrected in-vocabulary word.
                    corrected_word = correction(word)
                    if corrected_word in embedding_word_dict:
                        row = model[corrected_word]
                        cleared_embedding_list.append(row)
                        cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict)
                        continue  # avoid appending a second row for the same word below
                if self.cfg.do_synthezize_embeddings:
                    # Otherwise try to synthesize a vector for the OOV word.
                    row = get_oov_vector(word, model, threshold=self.cfg.synth_threshold)
                    if row is None:
                        k += 1
                        continue
                    cleared_embedding_list.append(row)
                    cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict)
            else:
                row = model[word]
                cleared_embedding_list.append(row)
                cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict)
        print('embeddings not found: {0:.1f}%'.format(l / len(words_dict) * 100))
        print('embeddings not synthesized: {0:.1f}%'.format(k / len(words_dict) * 100))
        return cleared_embedding_list, cleared_embedding_word_dict
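
The nested helpers above are Peter Norvig's classic spelling corrector, re-targeted at the embedding vocabulary instead of a corpus word count. The get_oov_vector helper that synthesizes vectors is not shown; below is a minimal sketch of what such a helper could look like, assuming it averages the vectors of close string matches and treats threshold as a similarity cutoff (both are assumptions, not the author's actual implementation):

import difflib
import numpy as np

def get_oov_vector_sketch(word, model, threshold=0.7):
    # Hypothetical stand-in for get_oov_vector: average the embeddings of in-vocabulary
    # words that are close string matches to the OOV word. `threshold` is used as the
    # difflib cutoff here (an assumption). Scanning a real vocabulary this way is slow;
    # the snippet is purely illustrative.
    matches = difflib.get_close_matches(word, model.keys(), n=5, cutoff=threshold)
    if not matches:
        return None
    return np.mean([model[m] for m in matches], axis=0)

toy_model = {'toxic': np.array([1.0, 0.0]), 'comment': np.array([0.0, 1.0])}
print(get_oov_vector_sketch('toxik', toy_model))  # -> [1. 0.]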
Example #2
 def tokenized_sentences2seq(self, tokenized_sentences, words_dict):
     print('converting to sequence')
     sequences = []
     for sentence in tqdm.tqdm(tokenized_sentences, mininterval=5):
         seq = []
         for token in sentence:
             try:
                 seq.append(words_dict[token])
             except KeyError:
                 seq.append(words_dict[UNKNOWN_WORD])
         sequences.append(seq)
     return sequences
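
The try/except KeyError here is simply a dictionary lookup with an unknown-word fallback; a standalone sketch of the same conversion (UNKNOWN_WORD is assumed to be the module-level sentinel token and is defined locally only for the demo):

UNKNOWN_WORD = '_UNK_'
words_dict = {'the': 1, 'cat': 2, UNKNOWN_WORD: 3}
tokenized = [['the', 'cat'], ['the', 'dog']]
sequences = [[words_dict.get(tok, words_dict[UNKNOWN_WORD]) for tok in sent] for sent in tokenized]
print(sequences)  # [[1, 2], [1, 3]] -- 'dog' falls back to the UNKNOWN_WORD id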
Example #3
    def create_word2id(self, list_of_tokenized_sentences):
        print('CREATING VOCABULARY')
        for tokenized_sentences in list_of_tokenized_sentences:
            for tokens in tqdm.tqdm(tokenized_sentences):
                self.word_counter.update(tokens)

        raw_counts = self.word_counter.most_common(self.cfg.max_words)
        vocab = [word for word, _ in raw_counts]
        print('%s words detected, keeping %s words' % (len(self.word_counter), len(vocab)))
        self.word2id = {word: (ind + 1) for ind, word in enumerate(vocab)}  # id 0 is left unused (e.g. for padding)
        self.word2id[UNKNOWN_WORD] = len(self.word2id) + 1  # own id, instead of colliding with the last vocab word's id
        self.id2word = dict((id, word) for word, id in self.word2id.items())
        print('finished')
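
A self-contained illustration of the same vocabulary construction with collections.Counter (UNKNOWN_WORD and max_words are toy stand-ins for the module constant and cfg.max_words):

from collections import Counter

UNKNOWN_WORD = '_UNK_'
max_words = 2
word_counter = Counter()
for tokens in [['the', 'cat', 'sat'], ['the', 'dog']]:
    word_counter.update(tokens)
vocab = [word for word, _ in word_counter.most_common(max_words)]
word2id = {word: ind + 1 for ind, word in enumerate(vocab)}  # id 0 stays free
word2id[UNKNOWN_WORD] = len(word2id) + 1
print(word2id)  # {'the': 1, 'cat': 2, '_UNK_': 3}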
Example #4
 def tokenize_sentences(self, sentences, mode='twitter'):
     twitter_tokenizer = TweetTokenizer()
     tokenized_sentences = []
     print('tokenizing sentences using %s' % mode)
     for sentence in tqdm.tqdm(sentences, mininterval=5):
         if hasattr(sentence, "decode"):
             sentence = sentence.decode("utf-8")
         #sentence = self.preprocessor.expand_contractions(sentence)
         if mode == 'nltk':
             tokens = nltk.tokenize.word_tokenize(sentence)
         elif mode == 'twitter':
             tokens = twitter_tokenizer.tokenize(sentence)
         else:
             raise ValueError('unknown tokenization mode: %s' % mode)
         tokenized_sentences.append(tokens)
     return tokenized_sentences
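
A quick standalone comparison of the two tokenizers used above; nltk's word_tokenize needs the 'punkt' resource (nltk.download('punkt')) the first time it runs:

import nltk
from nltk.tokenize import TweetTokenizer

text = "@user this isn't great :( #fail"
print(TweetTokenizer().tokenize(text))    # keeps '@user', ':(' and '#fail' as single tokens
print(nltk.tokenize.word_tokenize(text))  # Treebank-style: splits contractions and punctuation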
Example #5
 def clear_embedding_list_fasttext(self, model, words_dict):
     cleared_embedding_list = []
     cleared_embedding_word_dict = {}
     k = 0  # words processed
     l = 0  # words with no embedding in the model
     for word in tqdm.tqdm(words_dict):
         k += 1
         try:
             row = model[word]
             cleared_embedding_list.append(row)
             cleared_embedding_word_dict[word] = len(cleared_embedding_word_dict)
         except KeyError:
             l += 1
             continue
     print('embeddings not found: {0:.1f}%'.format(l / len(words_dict) * 100))
     print('words processed: {0}'.format(k))
     return cleared_embedding_list, cleared_embedding_word_dict
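
Here KeyError signals a word with no vector in the loaded model; with plain .vec word vectors every OOV word raises it, while a full fastText model could instead back off to character n-grams. A tiny stand-in showing the same lookup pattern with a plain dict of vectors:

import numpy as np

model = {'toxic': np.zeros(3), 'comment': np.ones(3)}  # stand-in for a KeyedVectors-style model
missed = []
for word in ['toxic', 'toxik']:
    try:
        row = model[word]
    except KeyError:
        missed.append(word)
print(missed)  # ['toxik']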
Example #6
    def convert_tokens_to_ids(self, tokenized_sentences, embedding_word_dict):
        words_train = []
        print('converting word index to embedding index')
        for sentence in tqdm.tqdm(tokenized_sentences):
            current_words = []
            for word_index in sentence:
                try:
                    word = self.id2word[word_index]
                    # fall back to the second-to-last embedding row, presumably the unknown-word vector
                    word_id = embedding_word_dict.get(word, len(embedding_word_dict) - 2)
                except KeyError:
                    word_id = embedding_word_dict.get(UNKNOWN_WORD, len(embedding_word_dict) - 2)
                current_words.append(word_id)

            if len(current_words) >= self.cfg.max_seq_len:
                current_words = current_words[:self.cfg.max_seq_len]
            else:
                # right-pad with the last embedding row, presumably the padding vector
                current_words += [len(embedding_word_dict) - 1] * (self.cfg.max_seq_len - len(current_words))
            words_train.append(current_words)
        return words_train
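
The last few lines just truncate or pad every sequence to a fixed length; a minimal standalone sketch with toy values for max_seq_len and the padding id:

max_seq_len, pad_id = 5, 99

def pad_or_truncate(ids):
    # cut long sequences, right-pad short ones with the padding id
    return ids[:max_seq_len] if len(ids) >= max_seq_len else ids + [pad_id] * (max_seq_len - len(ids))

print(pad_or_truncate([3, 7, 7, 1, 4, 2, 8]))  # -> [3, 7, 7, 1, 4]
print(pad_or_truncate([3, 7]))                 # -> [3, 7, 99, 99, 99]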