class fasttext_embedding:
    __dl = downloader()
    __w2v = None

    def __init__(self):
        # download and load the pretrained fastText model on first use
        self.__dl.download('fasttext_w2v', sbnltk_default.sbnltk_root_path + 'model/')
        self.__w2v = fasttext.load_model(sbnltk_default.sbnltk_root_path + 'model/fasttext_w2v.model')

    def get_vector(self, word):
        try:
            return self.__w2v[word]
        except Exception:
            raise ValueError('Sorry!! Word does not exist in vocab!!')

    def get_nearest_neighbors(self, word, n=5):
        return self.__w2v.get_nearest_neighbors(word, k=n)

    def cosine_distance(self, word1, word2):
        if word1 == word2:
            return 1.0
        try:
            vec1 = self.__w2v[word1]
        except Exception:
            raise ValueError('Sorry!! 1st word does not exist in vocab!!')
        try:
            vec2 = self.__w2v[word2]
        except Exception:
            raise ValueError('Sorry!! 2nd word does not exist in vocab!!')
        return 1.0 - spatial.distance.cosine(vec1, vec2)
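
# Illustrative usage sketch (not part of the original sbnltk source): shows how the
# fasttext_embedding API above is typically called; the Bangla words are placeholders.
def _demo_fasttext_embedding():
    emb = fasttext_embedding()                       # downloads/loads the fastText model on first use
    print(emb.get_vector('বাংলা'))                    # dense vector for a single word
    print(emb.get_nearest_neighbors('বাংলা', n=5))    # (score, word) pairs from fastText
    print(emb.cosine_distance('বাংলা', 'ভাষা'))       # 1.0 for identical words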
class static_postag:
    __dl = downloader()
    __dict = {}
    __stemmer = None
    __bp = preprocessor()
    __tokenizer = wordTokenizer()

    def __init__(self):
        # load the static word -> POS-tag dictionary
        self.__dl.download('postag_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__stemmer = stemmerOP()
        path = sbnltk_default.sbnltk_root_path + 'dataset/postag_static.txt'
        for word in open(path, 'r'):
            word = word.replace('\n', '')
            tokens = self.__tokenizer.basic_tokenizer(word)
            wd = tokens[0]
            val = tokens[-1]
            self.__dict[wd] = val

    def tag(self, sent):
        # lookup order: number check, exact match, normalized form, stemmed form, else 'unk'
        tokens = self.__tokenizer.basic_tokenizer(sent)
        ans = []
        for word in tokens:
            if self.__bp.is_number(word):
                ans.append((word, 'NUM'))
                continue
            if self.__dict.get(word):
                ans.append((word, self.__dict[word]))
                continue
            if self.__dict.get(self.__bp.word_normalize(word)):
                ans.append((word, self.__dict[self.__bp.word_normalize(word)]))
                continue
            stem_word = self.__stemmer.stemWord(word)
            if self.__dict.get(stem_word):
                ans.append((word, self.__dict[stem_word]))
                continue
            ans.append((word, 'unk'))
        return ans
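
# Illustrative usage sketch (not part of the original source): tag() returns a list of
# (word, tag) tuples, falling back to 'unk' when a token is not in the static dictionary.
def _demo_static_postag(sentence):
    tagger = static_postag()       # downloads the static tag dictionary on first use
    return tagger.tag(sentence)    # e.g. [('<word>', 'NN'), ('<word>', 'unk'), ...]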
class Bangla_sentence_embedding_gd:
    __dl = downloader()
    __model = None

    def __init__(self):
        if not os.path.exists(sbnltk_default.sbnltk_root_path + 'model/Towhid-Sust-transformer'):
            self.__dl.download('sentence_embedding_transformer_gd', sbnltk_default.sbnltk_root_path + 'model/')
            with zipfile.ZipFile(sbnltk_default.sbnltk_root_path + 'model/sentence_embedding_transformer_gd.zip', 'r') as file:
                file.extractall(sbnltk_default.sbnltk_root_path + 'model/')
            os.remove(sbnltk_default.sbnltk_root_path + 'model/sentence_embedding_transformer_gd.zip')
        self.__model = SentenceTransformer(sbnltk_default.sbnltk_root_path + 'model/Towhid-Sust-transformer')

    def encode_sentence_list(self, sentences):
        embeddings = {}
        sentence_embeddings = self.__model.encode(sentences)
        for sentence, embedding in zip(sentences, sentence_embeddings):
            embeddings[sentence] = embedding
        return embeddings

    def encode_single_sentence(self, sentence):
        return self.__model.encode(sentence)

    def similarity_of_two_sentence(self, sentence1, sentence2):
        embed = self.encode_sentence_list([sentence1, sentence2])
        return util.pytorch_cos_sim(embed[sentence1], embed[sentence2])

    def similarity_of_two_embedding(self, embedding1, embedding2):
        return util.pytorch_cos_sim(embedding1, embedding2)
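
# Illustrative usage sketch (not part of the original source): similarity values come from
# util.pytorch_cos_sim, so the result is a small torch tensor; casting to float is optional.
def _demo_sentence_embedding(sentence1, sentence2):
    embedder = Bangla_sentence_embedding_gd()       # downloads/unzips the transformer once
    score = embedder.similarity_of_two_sentence(sentence1, sentence2)
    return float(score)                             # single cosine-similarity value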
class stemmerOP:
    __wordtokens = wordTokenizer()
    __word_vec = []
    __word_dict = {}
    __word_dict2 = {}
    __bp = preprocessor()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('rootword_list', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('ner_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        # words seen in the static NER data
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', "r"):
            word = word.replace('\n', '')
            segment = word.split(' ')
            word = segment[:-1]
            for i in word:
                self.__word_dict[i] = 1
        # known root words
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/rootword_list.txt', "r"):
            word = word.replace('\n', '')
            self.__word_dict2[word] = 1

    def __search(self, word):
        if (self.__bp.word_normalize(word) in self.__word_dict) or (word in self.__word_dict) \
                or (word in self.__word_dict2) or (self.__bp.word_normalize(word) in self.__word_dict2):
            return True
        return False

    def __bnCompare(self, item1, item2):
        # longer suffixes sort first
        return (len(item1) < len(item2)) - (len(item1) > len(item2))

    def stemWord(self, word):
        try:
            if self.__word_dict2.get(word) is not None:
                return word
            # collect all rule suffixes that match the end of the word
            suf_arr = []
            for wd in rule_words:
                if re.search('.*' + wd + '$', word):
                    suf_arr.append(wd)
            suf_arr = sorted(suf_arr, key=functools.cmp_to_key(self.__bnCompare))
            if len(suf_arr) > 0:
                for i in suf_arr:
                    # first try the rule replacement, then a plain suffix strip
                    if i in rule_dict:
                        ind = len(word) - len(i)
                        new_word = word[0:ind] + rule_dict[i]
                        if self.__search(new_word):
                            return new_word
                    ind = len(word) - len(i)
                    new_word = word[0:ind]
                    if len(new_word) == 0:
                        return word
                    if self.__search(new_word):
                        return new_word
            return word
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL}ERROR 101: Error in stemming!! {sbnltk_default.bcolors.ENDC}")

    def stemSent(self, sent):
        tokens = self.__wordtokens.basic_tokenizer(sent)
        temp_tokens = []
        for i in tokens:
            temp_tokens.append(self.stemWord(i))
        result = ' '.join(temp_tokens)
        return result
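
# Illustrative usage sketch (not part of the original source): stemWord() strips a matching
# suffix from rule_words only when the result is a known word; stemSent() applies it per token.
def _demo_stemmer(sentence):
    stemmer = stemmerOP()               # loads the root-word and NER word lists
    return stemmer.stemSent(sentence)   # the sentence with each token stemmed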
class glove_embedding:
    __dl = downloader()
    __embeddings_dict = {}

    def __init__(self):
        self.__dl.download('glove_embedding', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('glove_id2word', sbnltk_default.sbnltk_root_path + 'model/')
        path = sbnltk_default.sbnltk_root_path + 'model/glove_embedding.pkl'
        model = pickle.load(open(path, 'rb'))
        id2word = sbnltk_default.sbnltk_root_path + 'model/glove_id2word.txt'
        with open(id2word, 'r') as f:
            for l in f:
                values = l.split()
                ind = int(values[0])
                word = str(values[1])
                vec = model[ind]
                if len(vec) < 100:
                    continue
                self.__embeddings_dict[word] = vec

    def get_vector(self, word):
        if word in self.__embeddings_dict:
            return self.__embeddings_dict[word]
        return np.zeros(100)

    def cosine_distance(self, word1, word2):
        vec1 = np.zeros(100)
        vec2 = np.zeros(100)
        flg = 0
        if word1 in self.__embeddings_dict:
            vec1 = self.__embeddings_dict[word1]
            flg += 1
        if word2 in self.__embeddings_dict:
            vec2 = self.__embeddings_dict[word2]
            flg += 1
        if flg < 2:
            # fall back when either word is out of vocabulary (avoids cosine with a zero vector)
            return 0.5
        d = 1.0 - spatial.distance.cosine(vec1, vec2)
        return d

    def get_nearest_neighbors(self, item, n):
        vec = []
        if item not in self.__embeddings_dict:
            vec.append(item)
            return vec
        result = sorted(self.__embeddings_dict.keys(),
                        key=lambda word: spatial.distance.euclidean(
                            self.__embeddings_dict[word], self.__embeddings_dict[item]))
        j = 0
        for i in result:
            if j >= n:
                break
            vec.append(i)
            j += 1
        return vec
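
# Illustrative usage sketch (not part of the original source): glove_embedding, and the
# gensim_word2vec_embedding class below, expose the same get_vector / cosine_distance /
# get_nearest_neighbors interface; out-of-vocabulary words fall back to a zero vector.
def _demo_glove(word1, word2):
    glove = glove_embedding()                    # loads the pickled embedding matrix
    sim = glove.cosine_distance(word1, word2)    # 0.5 is the out-of-vocabulary fallback
    neighbours = glove.get_nearest_neighbors(word1, 5)
    return sim, neighbours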
class gensim_word2vec_embedding:
    __dl = downloader()
    __embeddings_dict = {}

    def __init__(self):
        self.__dl.download('gensim_w2v', sbnltk_default.sbnltk_root_path + 'model/')
        path = sbnltk_default.sbnltk_root_path + 'model/gensim_w2v.txt'
        with open(path, 'r') as f:
            for l in f:
                values = l.split()
                word = str(values[0])
                vec = np.asarray(values[1:], "float32")
                if len(vec) < 100:
                    continue
                self.__embeddings_dict[word] = vec

    def get_vector(self, word):
        if word in self.__embeddings_dict:
            return self.__embeddings_dict[word]
        return np.zeros(100)

    def cosine_distance(self, word1, word2):
        if word1 == word2:
            return 1.0
        vec1 = np.zeros(100)
        vec2 = np.zeros(100)
        flg = 0
        if word1 in self.__embeddings_dict:
            vec1 = self.__embeddings_dict[word1]
            flg += 1
        if word2 in self.__embeddings_dict:
            vec2 = self.__embeddings_dict[word2]
            flg += 1
        if flg <= 1:
            return 0.5
        d = 1.0 - spatial.distance.cosine(vec1, vec2)
        return d

    def get_nearest_neighbors(self, item, n):
        vec = []
        if item not in self.__embeddings_dict:
            vec.append(item)
            return vec
        result = sorted(self.__embeddings_dict.keys(),
                        key=lambda word: spatial.distance.euclidean(
                            self.__embeddings_dict[word], self.__embeddings_dict[item]))
        j = 0
        for i in result:
            if j >= n:
                break
            vec.append(i)
            j += 1
        return vec
class static_NER:
    __ner_static_data = {}
    __bp = preprocessor()
    __stemmer = stemmerOP()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('ner_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        for word in open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', "r"):
            word = word.replace('\n', '')
            segment = word.split(' ')
            tag = segment[-1]
            word = segment[:-1]
            word = ' '.join(word)
            self.__ner_static_data[word] = tag

    def tag(self, sentence):
        # greedy longest-span lookup: try the raw span first, then its stemmed form
        segment = sentence.split()
        stems = self.__stemmer.stemSent(sentence)
        stems = stems.split()
        i = 0
        sentence_tags = []
        while i < len(segment):
            j = len(segment)
            flg = 0
            while j > i:
                now = ' '.join(segment[i:j])
                now2 = ' '.join(stems[i:j])
                if self.__ner_static_data.get(now) is not None:
                    sentence_tags.append((now, self.__ner_static_data[now]))
                    i = j - 1
                    flg = 1
                    break
                if self.__ner_static_data.get(now2) is not None:
                    sentence_tags.append((now, self.__ner_static_data[now2]))
                    i = j - 1
                    flg = 1
                j -= 1
            if flg == 0:
                sentence_tags.append((segment[i], 'O'))
            i += 1
        return sentence_tags
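
# Illustrative usage sketch (not part of the original source): tag() greedily matches the
# longest span found in the static NER dictionary and labels everything else 'O'.
def _demo_static_ner(sentence):
    ner = static_NER()          # downloads the static NER dictionary on first use
    return ner.tag(sentence)    # list of (span, tag) tuples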
class sklearn_NER:
    __dl = downloader()
    __bp = preprocessor()
    __sk_model = None

    def __init__(self):
        self.__dl.download('sklearn_ner', sbnltk_default.sbnltk_root_path + 'model/')
        self.__sk_model = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sklearn_ner.pkl', 'rb'))

    def word2features(self, sent, i):
        return {
            'word': sent[i],
            'is_first': i == 0,
            'is_last': i == len(sent) - 1,
            'is_capitalized': sent[i][0].upper() == sent[i][0],
            'is_all_caps': sent[i].upper() == sent[i],
            'is_all_lower': sent[i].lower() == sent[i],
            'prefix-1': sent[i][0],
            'prefix-2': sent[i][:2],
            'prefix-3': sent[i][:3],
            'suffix-1': sent[i][-1],
            'suffix-2': sent[i][-2:],
            'suffix-3': sent[i][-3:],
            'prev_word': '' if i == 0 else sent[i - 1],
            'next_word': '' if i == len(sent) - 1 else sent[i + 1],
            'is_numeric': sent[i].isdigit()
        }

    def tag(self, text):
        if len(text) == 0:
            return []
        words = text.split()
        sentence_features = [self.word2features(words, i) for i in range(len(words))]
        return list(zip(words, self.__sk_model.predict([sentence_features])[0]))
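
# Illustrative usage sketch (not part of the original source): the pickled sklearn model
# predicts one tag per whitespace token from the hand-crafted word2features() features.
def _demo_sklearn_ner(sentence):
    ner = sklearn_NER()         # loads the pickled sklearn model
    return ner.tag(sentence)    # list of (word, tag) tuples; [] for empty input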
class bert_Multilingual_Uncased_Postag:
    __model = None
    __dl = downloader()
    __device = True if torch.cuda.is_available() else False
    __module_found = 1
    try:
        import simpletransformers.ner.ner_model as nermodel
        __module_found = 1
    except Exception:
        __module_found = 0

    def __init__(self):
        if self.__module_found == 0:
            raise ValueError('Please install simpletransformers!! Install command: pip3 install simpletransformers')
        if not os.path.exists(sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag'):
            self.__dl.download('bert_multi_uncased_postag', sbnltk_default.sbnltk_root_path + 'model/')
            with zipfile.ZipFile(sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag.zip', 'r') as file:
                file.extractall(sbnltk_default.sbnltk_root_path + 'model/')
            os.remove(sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag.zip')
        t_h = sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag/model_args.json'
        t_g = sbnltk_default.sbnltk_root_path + 'model/bert_multi_uncased_postag/'
        self.__model = self.nermodel.NERModel('bert', t_g, use_cuda=self.__device, args=t_h)

    def tag(self, sentences):
        d, f = self.__model.predict(sentences)
        return d
class preprocessor:
    __dl = downloader()
    __word_list = {}
    __stopwords = []

    def __init__(self):
        self.__dl.download('bangla_word_list', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('stopword_list', sbnltk_default.sbnltk_root_path + 'dataset/')
        for line in open(sbnltk_default.sbnltk_root_path + 'dataset/bangla_word_list.txt', 'r'):
            line = line.rstrip('\n')
            self.__word_list[line] = 1
        model_path = sbnltk_default.sbnltk_root_path + "dataset/stopword_list.txt"
        for i in open(model_path, "r"):
            i = i.rstrip("\n")
            self.__stopwords.append(i)

    def punctuation_remove(self, text):
        try:
            whitespace = re.compile(
                u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+",
                re.UNICODE)
            bangla_fullstop = u"\u0964"
            punctSeq = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
            punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
            text = whitespace.sub(" ", text).strip()
            text = re.sub(punctSeq, " ", text)
            text = re.sub(bangla_fullstop, " ", text)
            text = re.sub(punc, " ", text)
            text = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]', ' ', text)
            text = text.replace("\\", " ")
            return text
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 201: Error in removing punctuation!! {sbnltk_default.bcolors.ENDC}")
            return text

    def dust_removal(self, word):
        # keep only characters whose escape code maps to a known Bangla character
        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn2en:
                    s += c
            if len(s) == 0:
                return word
            return s
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 202: Error in removing dust!! {sbnltk_default.bcolors.ENDC}")
            return word

    def dust_removal_sent(self, sentence):
        words = sentence.split()
        temp = []
        for i in words:
            temp.append(self.dust_removal(i))
        temp = ' '.join(temp)
        return temp

    def stopword_remove(self, text):
        try:
            querywords = text.split()
            resultwords = [word for word in querywords if word not in self.__stopwords]
            result = ' '.join(resultwords)
            return result
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 203: Error in removing stop words!! {sbnltk_default.bcolors.ENDC}")
            return text

    def word_normalize(self, word):
        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn_norm:
                    g = StaticArray.bn_norm[g].encode().decode('utf-8')
                    s += g
                    continue
                s += c
            return s
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 204: Error in word normalization!! {sbnltk_default.bcolors.ENDC}")
            return word

    def bangla_to_english_Conversion(self, word):
        try:
            s = ""
            for c in word:
                g = c.encode("unicode_escape")
                g = g.upper()
                g = g[2:]
                g = g.decode('utf-8')
                if g in StaticArray.bn2enPunc:
                    if len(s) > 0 and s[-1] == 'a':
                        s = s[:-1]
                    s += StaticArray.bn2enPunc[g]
                    continue
                if g in StaticArray.bn2en:
                    s += StaticArray.bn2en[g]
            return s
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 205: Error in Bangla to English conversion!! {sbnltk_default.bcolors.ENDC}")
            return word

    def __bnCompare(self, item1, item2):
        g1 = self.bangla_to_english_Conversion(item1)
        g2 = self.bangla_to_english_Conversion(item2)
        return (g1 > g2) - (g1 < g2)

    def isBanglaWord(self, word):
        if word in self.__word_list:
            return True
        return False

    def isBangla(self, word):
        for c in word:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g in StaticArray.bn2en:
                return True
        return False

    def bn_word_sort_en_sys(self, vec):
        try:
            temp_vec = []
            for i in vec:
                if self.isBangla(i):
                    i = self.dust_removal(i)
                    temp_vec.append(self.punctuation_remove(i).replace(' ', ''))
            vec = list(set(temp_vec))
            vec = sorted(vec, key=functools.cmp_to_key(self.__bnCompare))
            return vec
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 206: Error in sorting Bangla words according to the English alphabet!! {sbnltk_default.bcolors.ENDC}")
            return vec

    def __bnCompare2(self, item1, item2):
        ln = min(len(item1), len(item2))
        for i in range(ln):
            if item1[i] == item2[i]:
                continue
            g1 = item1[i].encode("unicode_escape")
            g1 = g1.upper()
            g1 = g1[2:]
            g1 = g1.decode('utf-8')
            g1 = StaticArray.bn_serial[g1]
            g2 = item2[i].encode("unicode_escape")
            g2 = g2.upper()
            g2 = g2[2:]
            g2 = g2.decode('utf-8')
            g2 = StaticArray.bn_serial[g2]
            return (g1 > g2) - (g1 < g2)
        return (len(item1) > len(item2)) - (len(item1) < len(item2))

    def bn_word_sort_bn_sys(self, vec):
        try:
            temp_vec = []
            for i in vec:
                if self.isBangla(i):
                    i = self.dust_removal(i)
                    temp_vec.append(self.punctuation_remove(i).replace(' ', ''))
            vec = list(set(temp_vec))
            vec = sorted(vec, key=functools.cmp_to_key(self.__bnCompare2))
            return vec
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 207: Error in sorting Bangla words according to the Bangla alphabet!! {sbnltk_default.bcolors.ENDC}")
            return vec

    def is_number(self, word):
        for c in word:
            g = c.encode("unicode_escape")
            g = g.upper()
            g = g[2:]
            g = g.decode('utf-8')
            if g in StaticArray.bn2enNum:
                return True
        return False

    def extra_space_remove(self, sent):
        while len(sent) > 0 and sent[0] == ' ':
            sent = sent[1:]
        temp = ''
        for i in sent:
            if len(temp) > 0 and temp[-1] == ' ' and i == ' ':
                continue
            temp += i
        return temp
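
# Illustrative usage sketch (not part of the original source): a typical cleaning
# pipeline chains punctuation removal, extra-space removal and stopword removal.
def _demo_preprocess(text):
    bp = preprocessor()                 # downloads the word and stopword lists
    text = bp.punctuation_remove(text)
    text = bp.extra_space_remove(text)
    text = bp.stopword_remove(text)
    return text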
class sentimentAnalyzer:
    __dl = downloader()
    __sentiment_models = [('LR', 'Logistic Regression'),
                          ('LSVC', 'Linear SVC'),
                          ('MNB', 'Multinomial naive bayes'),
                          ('RF', 'Random Forest'),
                          ('BERT', 'Bert Sentiment Analysis')]
    __root_path = sbnltk_default.sbnltk_root_path

    def all_sentiment_models(self):
        st = 'All sentiment analysis model names with codes\n'
        for sent in self.__sentiment_models:
            st += sent[1] + ' ::: ' + sent[0] + '\n'
        return st

    def __LR(self, sentences):
        self.__dl.download('sentiment_LR', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector', sbnltk_default.sbnltk_root_path + 'model/')
        logreg = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_LR.pkl', 'rb'))
        vectorizer = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl', 'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(logreg.predict(unknown_words_df)[i])
            prop.append(logreg.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __LSVC(self, sentences):
        self.__dl.download('sentiment_LSVC', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector', sbnltk_default.sbnltk_root_path + 'model/')
        svc = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_LSVC.pkl', 'rb'))
        vectorizer = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl', 'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        for i in range(len(sentences)):
            pred.append(svc.predict(unknown_words_df)[i])
        return pred

    def __MNB(self, sentences):
        self.__dl.download('sentiment_MNB', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector', sbnltk_default.sbnltk_root_path + 'model/')
        mnb = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_MNB.pkl', 'rb'))
        vectorizer = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl', 'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(mnb.predict(unknown_words_df)[i])
            prop.append(mnb.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __RF(self, sentences):
        self.__dl.download('sentiment_RF', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_vector', sbnltk_default.sbnltk_root_path + 'model/')
        rf = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_RF.pkl', 'rb'))
        vectorizer = pickle.load(open(sbnltk_default.sbnltk_root_path + 'model/sentiment_vector.pkl', 'rb'))
        unknown_vectors = vectorizer.transform(sentences)
        unknown_words_df = pd.DataFrame(unknown_vectors.toarray(),
                                        columns=vectorizer.get_feature_names())
        pred = []
        prop = []
        for i in range(len(sentences)):
            pred.append(rf.predict(unknown_words_df)[i])
            prop.append(rf.predict_proba(unknown_words_df)[:, 1][i])
        return pred, prop

    def __sentence_convert_data(self, data):
        # convert one sentence into the [tokens, masks, segments] arrays the BERT model expects
        tokenizer = BertTokenizer.from_pretrained(
            sbnltk_default.sbnltk_root_path + 'model/sentiment_multilingual_vocab.txt')
        SEQ_LEN = 147
        tokens, masks, segments = [], [], []
        token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')
        num_zeros = token.count(0)
        mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
        segment = [0] * SEQ_LEN
        tokens.append(token)
        segments.append(segment)
        masks.append(mask)
        tokens = np.array(tokens)
        masks = np.array(masks)
        segments = np.array(segments)
        return [tokens, masks, segments]

    def __b_predict(self, bert, sentences):
        pred = []
        prop = []
        for sent in sentences:
            data_x = self.__sentence_convert_data(sent)
            predict = bert.predict(data_x)
            predict_value = np.ravel(predict)
            predict_answer = np.round(predict_value, 0).item()
            if predict_answer == 0:
                pred.append(0)
                prop.append((1.0 - predict_value[0]))
            else:
                pred.append(1)
                prop.append((predict_value[0]))
        return pred, prop

    def __create_sentiment_bert(self):
        SEQ_LEN = 147
        model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
        mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
        segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
        bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
        bert_outputs = bert_outputs[1]
        sentiment_first = tf.keras.layers.Dense(
            1, activation='sigmoid',
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
        sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
        opt = tfa.optimizers.RectifiedAdam(lr=2.0e-5, weight_decay=0.0025)
        sentiment_model.compile(optimizer=opt,
                                loss=tf.keras.losses.BinaryCrossentropy(),
                                metrics=['acc'])
        return sentiment_model

    def __BERT(self, sentence):
        self.__dl.download('sentiment_BERT', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('sentiment_multilingual_vocab', sbnltk_default.sbnltk_root_path + 'model/')
        bert = self.__create_sentiment_bert()
        bert.load_weights(sbnltk_default.sbnltk_root_path + 'model/sentiment_BERT.h5')
        return self.__b_predict(bert, sentence)

    def predict(self, model_code, sentences):
        if len(sentences) == 0:
            raise ValueError('Empty list of sentences was passed to sentiment analysis!!')
        if model_code == 'LR':
            pred, prop = self.__LR(sentences)
            return pred, prop
        elif model_code == 'LSVC':
            pred = self.__LSVC(sentences)
            return pred
        elif model_code == 'MNB':
            pred, prop = self.__MNB(sentences)
            return pred, prop
        elif model_code == 'RF':
            pred, prop = self.__RF(sentences)
            return pred, prop
        elif model_code == 'BERT':
            pred, prop = self.__BERT(sentences)
            return pred, prop
        else:
            raise ValueError('Model code does not exist!!\n' + self.all_sentiment_models())
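
# Illustrative usage sketch (not part of the original source): every model code except
# 'LSVC' returns (labels, probabilities); 'LSVC' returns labels only.
def _demo_sentiment(sentences):
    analyzer = sentimentAnalyzer()
    print(analyzer.all_sentiment_models())          # lists the available model codes
    pred, prob = analyzer.predict('LR', sentences)  # logistic-regression baseline
    return list(zip(sentences, pred, prob))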
class bert_multilingual_cased_postag:
    __dl = downloader()
    __model = None
    __tokenizer = None
    __device = None
    __tag2idx = {'CC': 10, 'CD': 8, 'DT': 6, 'IN': 5, 'JJ': 0, 'NN': 4, 'NNP': 3,
                 'NNS': 1, 'PRE': 12, 'PRF': 9, 'PRP': 13, 'RB': 7, 'VB': 2, 'WH': 11}
    __tags2vals = {}

    def __init__(self):
        self.__device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.__dl.download('bert_multi_cased_postag', sbnltk_default.sbnltk_root_path + 'model/')
        self.__dl.download('bert_vocab_postag', sbnltk_default.sbnltk_root_path + 'model/')
        self.__tokenizer = BertTokenizer.from_pretrained(
            sbnltk_default.sbnltk_root_path + 'model/bert_vocab_postag.txt')
        self.__model = torch.load(sbnltk_default.sbnltk_root_path + 'model/bert_multi_cased_postag.pth',
                                  map_location=self.__device)
        # build the reverse index -> tag lookup
        for i in self.__tag2idx:
            self.__tags2vals[self.__tag2idx[i]] = i
        self.__model.eval()

    def tag(self, sentences):
        max_seq_len = 128  # maximum number of word-piece tokens per sentence
        batch_s = 8
        all_sentence_tags = []
        for sentence in sentences:
            sentence = [sentence]
            words = sentence[0].split()
            # dummy 'NN' labels are only needed to build tensors of the right shape
            false_labels = []
            for w in range(len(words)):
                false_labels.append('NN')
            labels = [false_labels]
            tokenized_texts = [self.__tokenizer.tokenize(sent) for sent in sentence]
            X = pad_sequences([self.__tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=max_seq_len, dtype="long", truncating="post", padding="post")
            Y = pad_sequences([[self.__tag2idx.get(l) for l in lab] for lab in labels],
                              maxlen=max_seq_len, value=self.__tag2idx["NN"], padding="post",
                              dtype="long", truncating="post")
            attention_masks = [[float(i > 0) for i in ii] for ii in X]
            X_train = torch.tensor(X)
            Y_train = torch.tensor(Y)
            Mask_train = torch.tensor(attention_masks)
            data_valid = TensorDataset(X_train, Mask_train, Y_train)
            data_valid_sampler = SequentialSampler(data_valid)
            DL_valid = DataLoader(data_valid, sampler=data_valid_sampler, batch_size=batch_s)
            predictions = []
            for batch in DL_valid:
                batch = tuple(t.to(self.__device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch
                with torch.no_grad():
                    logits = self.__model(b_input_ids, token_type_ids=None,
                                          attention_mask=b_input_mask)
                logits = logits.detach().cpu().numpy()
                predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            pred_tags = [[self.__tags2vals[p_i] for p_i in p] for p in predictions]
            pred_tags = pred_tags[0][:(len(words))]
            temp_dict = []
            for i in range(len(words)):
                temp_dict.append((words[i], pred_tags[i]))
            all_sentence_tags.append(temp_dict)
        return all_sentence_tags
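
# Illustrative usage sketch (not part of the original source): tag() expects a list of
# sentences and returns one list of (word, tag) tuples per input sentence.
def _demo_bert_postag(sentences):
    tagger = bert_multilingual_cased_postag()   # loads the fine-tuned BERT tagger
    return tagger.tag(sentences)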