def get_one_sentence_vector(cls, tm, sentence):
    import fasttext
    tokens = fasttext.tokenize(sentence)
    if isinstance(tm, fasttext.FastText._FastText):
        # Native fastText model: look up each token and build a tensor.
        result = torch.tensor([tm[t] for t in tokens])
    elif isinstance(tm, torchnlp.word_to_vector.char_n_gram.CharNGram):
        # torchnlp CharNGram lookups already return tensors, so stack them.
        result = torch.stack([tm[t] for t in tokens])
    else:
        # Fall back to models that accept a list of tokens directly.
        result = tm[tokens]
    return result
def tokenize(keyword):
    """ Tokenizes using default fasttext tokenizer.

    Args:
        keyword: Keyword string (can be multi-word phrase!).

    Returns:
        List of words (tokens) from the keyword.
    """
    return fasttext.tokenize(keyword)
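# Minimal usage sketch for the wrapper above; the example strings are illustrative,
# and the expected outputs match the tokenizer tests later in this section.
import fasttext

print(fasttext.tokenize("machine learning"))  # ['machine', 'learning']
print(fasttext.tokenize("one line\n"))        # ['one', 'line', '</s>'] -- newlines map to the EOS marker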
def is_similar_fasttext(str1: str, str2: str) -> Tuple[bool, float]:
    """Decides whether two strings are similar enough.

    Arguments:
        str1 {str} -- first string to compare
        str2 {str} -- second string to compare

    Returns:
        Tuple[bool, float] -- (similarity verdict, Levenshtein ratio)
    """
    # NOTE: the model file is 6.7 GB!
    kaz_ft_model = fasttext.load_model(path="./models/cc.kk.300.bin")
    str1_tok = fasttext.tokenize(text=str1)
    str2_tok = fasttext.tokenize(text=str2)
    # The decision below is based purely on the Levenshtein ratio.
    ratio = Levenshtein.ratio(str1, str2)
    if ratio < SIMILARITY_THRESHOLD:
        return (False, ratio)
    else:
        return (True, ratio)
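# The function above loads the fastText model and tokenizes both strings but never
# uses the vectors. The sketch below is a hypothetical alternative that does use them;
# the function name, the default threshold and the reuse of a loaded cc.kk.300.bin
# model are assumptions, not part of the original code.
from typing import Tuple

import fasttext
import numpy as np


def is_similar_fasttext_vectors(str1: str, str2: str, model, threshold: float = 0.8) -> Tuple[bool, float]:
    def avg_vector(text: str) -> np.ndarray:
        # Average the fastText vectors of the tokens in the text.
        tokens = fasttext.tokenize(text)
        return np.mean([model.get_word_vector(t) for t in tokens], axis=0)

    v1, v2 = avg_vector(str1), avg_vector(str2)
    cos = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
    return (cos >= threshold, cos)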
def get_nltk_vectors(self, texts: List[str]):
    # https://gist.github.com/japerk/1909413
    from textblob import TextBlob
    sid = self.nltk_sid
    vsid = self.vader_sid
    pdict = self.pdict
    n_tokens_in = self.n_tokens_in
    rake = self.rake_nltk
    nltk_texts = [fasttext.tokenize(text) for text in texts]
    textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity]
                           for sentiment in [TextBlob(text).sentiment for text in texts]]
    textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
    textblob_sentiments = textblob_sentiments.to(get_device())

    mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
    mask = mask.to(get_device())
    mask = self.is_mask_em(mask)
    has_digit = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
    has_digit = has_digit.to(get_device())
    has_digit = self.has_digit_em(has_digit)

    m = self.text_model
    nltk_emb = stack_and_pad_tensors([torch.tensor([m[t] for t in sent]) for sent in nltk_texts],
                                     n_tokens_in)  # if t in m else np.zeros(m.vector_size)
    nltk_emb = nltk_emb.to(get_device())
    sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
    sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
    sid_vec = sid_vec.to(get_device())
    vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
    vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
    vsid_vec = vsid_vec.to(get_device())

    conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]
    pos = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
    pos = pos.to(get_device())
    pos_emb = self.tag_em(pos)
    ner = stack_and_pad_tensors(
        list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)),
        n_tokens_in)
    ner = ner.to(get_device())
    ner_emb = self.tag_em(ner)

    phrases = [get_rake_nltk_phrases(rake, t) for t in texts]
    key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
    key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
    key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
    nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

    result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb,
                        nltk_rake_vectors, sid_vec, mask, has_digit], 2)
    result = result.to(get_device())
    result = self.nltk_nn(result)
    return result
def gen_test_tokenize(self, kwargs):
    self.assertEqual(["asdf", "asdb"], fasttext.tokenize("asdf asdb"))
    self.assertEqual(["asdf"], fasttext.tokenize("asdf"))
    self.assertEqual([fasttext.EOS], fasttext.tokenize("\n"))
    self.assertEqual(["asdf", fasttext.EOS], fasttext.tokenize("asdf\n"))
    self.assertEqual([], fasttext.tokenize(""))
    self.assertEqual([], fasttext.tokenize(" "))
    # An empty string is not a token (it's just whitespace),
    # so the minimum length must be 1.
    words = get_random_words(100, 1, 20)
    self.assertEqual(words, fasttext.tokenize(" ".join(words)))
def input_vectors(self, sentences: list):
    max_num_words = 4
    tokens = []
    batch_size = len(sentences)
    h, w = (self.dim, 1)
    # Loop over the batch to tokenize the inputs.
    for i in range(batch_size):
        # Tokenize words using the default fasttext tokenizer, which creates tokens
        # by splitting at word-separating characters.
        tokens.append(fasttext.tokenize(sentences[i]))
    # Create a matrix with batch_size batches, max_num_words token channels and
    # dim x 1 matrices to store the embeddings in (e.g. 100x1 for 100-dim vectors).
    in_vector = np.zeros((batch_size, max_num_words, h, w))
    # Cycle over the tokens, get their vectors, reshape them to dim x 1 and store
    # them in the corresponding channel of the return variable.
    # Cycle over the entire batch.
    for j in range(len(tokens)):
        # Counter for tokens.
        i = 0
        # Cycle over tokens.
        for token in tokens[j]:
            # Get the embedding for the single token.
            vector = torch.tensor(self.ft[token].astype(np.double))
            # Reshape it to the desired dims.
            vector = vector.reshape(h, w)
            # Store it in the input vectors matrix.
            in_vector[j][i] = vector
            # Increment the position of the word index within the given sentence;
            # if it goes over the max word count, cut the sentence off.
            i = i + 1
            if (i == max_num_words):
                break
    # Create a tensor object to return.
    in_vector = torch.tensor(in_vector)
    return in_vector
def _process_variable(self, word):
    labels = []
    words = []
    variable_word = word.replace('{', '').replace('}', '')
    try:
        words = fasttext.tokenize(
            self.literals[variable_word].get().lower())
    except:
        print(word, variable_word)
    for i in range(len(words)):
        label = BEGINNING + variable_word if i == 0 else INSIDE + variable_word
        labels.append(label)
    return words, labels
def get_fasttext_embeddings(x: List[str], ft=None, path: str = None):
    if ft is None:
        if path is None:
            raise Exception("Both path and ft can't be None")
        ft = fasttext.load_model(path)
    embeddings = []
    for sentence in x:
        tokens = fasttext.tokenize(sentence)
        representation = []
        for token in tokens:
            representation.append(ft[token])
        embeddings.append(representation)
    embeddings = pad(embeddings, [0 for _ in range(100)], 32)
    return embeddings
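# The `pad` helper used above is not shown. A hypothetical implementation consistent
# with the call pad(embeddings, [0 for _ in range(100)], 32) -- pad or truncate every
# token sequence to a fixed length, filling with a zero vector -- might look like this:
def pad(sequences, pad_value, length):
    padded = []
    for seq in sequences:
        seq = list(seq)[:length]                       # truncate long sequences
        seq = seq + [pad_value] * (length - len(seq))  # pad short ones with the fill value
        padded.append(seq)
    return padded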
def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
def preprocess_text_for_language_detection(text: str):
    """
    Cleans the text as per fasttext requirements.
    The requirements can be found here: https://pypi.org/project/fasttext/

    :text: str: text to clean
    :returns: str: cleaned text
    """
    # fastText assumes UTF-8 encoded text.
    text = str(text)
    # fastText is not aware of UTF-8 whitespace:
    # replace all whitespace with plain spaces.
    text = white_space_pattern.sub(" ", text)
    # Tokenize the text with the fasttext tokenizer and rejoin the tokens.
    tokens = tokenize(text)
    text = " ".join(tokens)
    n = len(tokens)
    # Remove the EOS marker the tokenizer inserts, as it affects the model accuracy.
    text = text.replace("</s>", "")
    return text.lower()
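# Usage sketch for the preprocessing function above. Feeding the cleaned text to
# fastText's language-identification model is an assumption here; lid.176.bin is the
# publicly distributed language-ID model, not something referenced by the original code.
import fasttext

lid_model = fasttext.load_model("lid.176.bin")
cleaned = preprocess_text_for_language_detection("Bonjour tout le monde !\n")
labels, probs = lid_model.predict(cleaned)
print(labels[0], probs[0])  # e.g. '__label__fr' with its confidence score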
def encode(self, text):
    tokens = tokenize(text.lower().replace('\n', ' ') + '\n')
    return [
        self.vocab[t] if t in self.vocab else self.vocab['[UNK]']
        for t in tokens
    ]
def get_corpus_description(data):
    corpus = ''.join(data["text"])
    corpus_tokens = fasttext.tokenize(corpus)
    print("raw number of tokens: %d" % len(corpus_tokens))
    counts = Counter(corpus_tokens)
    print("raw number of distinct tokens: %d" % len(counts))

    print("#### running the spellchecker ####")
    # The only spellcheck being done is collapsing letters repeated three or more times.
    checked_corpus = re.sub(r"(.)\1{2,}", r"\1\1", corpus)
    corpus_tokens = fasttext.tokenize(checked_corpus)
    print("number of tokens: %d" % len(corpus_tokens))
    counts = Counter(corpus_tokens)
    print("number of distinct tokens: %d" % len(counts))

    print('#### removing html tags ####')
    html_free_corpus = re.sub(
        '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\\u201c|\\u2019',
        '', checked_corpus)
    html_free_corpus_tokens = fasttext.tokenize(html_free_corpus)
    print("number of tokens: %d" % len(html_free_corpus_tokens))
    html_counts = Counter(html_free_corpus_tokens)
    print("number of distinct tokens: %d" % len(html_counts))

    print('#### removing links ####')
    link_free_corpus = re.sub(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
        '', html_free_corpus)
    link_free_corpus_tokens = fasttext.tokenize(link_free_corpus)
    print("number of tokens: %d" % len(link_free_corpus_tokens))
    link_counts = Counter(link_free_corpus_tokens)
    print("number of distinct tokens: %d" % len(link_counts))

    print("#### removing special symbols and numbers ####")
    # Remove special characters and numbers.
    clean_corpus = re.sub(r'[^A-Za-z\s]+', '', link_free_corpus)
    clean_corpus_tokens = fasttext.tokenize(clean_corpus)
    print("number of tokens: %d" % len(clean_corpus_tokens))
    clean_counts = Counter(clean_corpus_tokens)
    print("number of distinct tokens: %d" % len(clean_counts))
    # Only the special symbols.
    dirty_corpus = re.sub(r'[A-Za-z\s]+', '', corpus)
    distinct_symbols = Counter(dirty_corpus)
    print("Number of distinct removed special symbols: %d" % len(distinct_symbols))

    print("#### removing stop words ####")
    stop_words = set(stopwords.words('english'))
    stop_words = [re.sub(r'[^A-Za-z\s]+', '', word) for word in stop_words]
    print("number of stop words: %d" % len(stop_words))
    corpus_wo_stop_words = [
        token for token in clean_corpus_tokens if token not in stop_words
    ]
    counts_wo_stopwords = Counter(corpus_wo_stop_words)
    print("Number of tokens wo stopwords: %d" % len(corpus_wo_stop_words))
    print("Number of distinct tokens: %d" % len(counts_wo_stopwords))

    print("#### lemmatization ####")
    lemmatizer = WordNetLemmatizer()
    lemmatized_corpus = [
        lemmatizer.lemmatize(x) for x in tqdm(corpus_wo_stop_words)
    ]
    counts_lemmatized = Counter(lemmatized_corpus)
    print("Number of lemmatized tokens: %d" % len(lemmatized_corpus))
    print("Number of distinct lemmatized tokens: %d" % len(counts_lemmatized))
    return counts_lemmatized.most_common(25000)
def description_to_tensor(model, desc):
    return torch.stack([
        torch.tensor(model.get_word_vector(w))
        for w in fasttext.tokenize(desc)
    ])
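# Usage sketch for description_to_tensor; the cc.en.300.bin model path and the example
# description are assumptions (any fastText .bin model with get_word_vector would do).
import fasttext
import torch

ft_model = fasttext.load_model("cc.en.300.bin")
desc_tensor = description_to_tensor(ft_model, "a compact red sports car")
print(desc_tensor.shape)  # torch.Size([num_tokens, 300]) for a 300-dim model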
def get_one_sentence_vector(self, m, text):
    # m is expected to behave like a gensim keyed-vectors model
    # (supports `t in m`, indexing by token, and `vector_size`).
    vs = min(m.vector_size, 150)
    zeros = np.zeros(vs)
    # Truncate each vector to at most 150 dimensions; unknown tokens map to zeros.
    result = [m[t][:150] if t in m else zeros for t in fasttext.tokenize(text)]
    return torch.tensor(result, dtype=float)