Example #1
class static_postag:
    # Dictionary-based POS tagger: each token is looked up in a static
    # word -> tag table loaded from postag_static.txt.
    __dl = downloader()
    __dict = {}
    __stemmer = None
    __bp = preprocessor()
    __tokenizer = wordTokenizer()

    def __init__(self):
        self.__dl.download('postag_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__stemmer = stemmerOP()
        path = sbnltk_default.sbnltk_root_path + 'dataset/postag_static.txt'
        # Each line holds a word and its tag: the first token is the word,
        # the last token is the tag.
        with open(path, 'r') as f:
            for line in f:
                tokens = self.__tokenizer.basic_tokenizer(line.replace('\n', ''))
                self.__dict[tokens[0]] = tokens[-1]

    def tag(self, sent):
        # Tagging cascade: numeric check, exact lookup, normalized lookup,
        # stemmed lookup, then the 'unk' fallback.
        tokens = self.__tokenizer.basic_tokenizer(sent)
        ans = []
        for word in tokens:
            if self.__bp.is_number(word):
                ans.append((word, 'NUM'))
                continue
            if self.__dict.get(word):
                ans.append((word, self.__dict[word]))
                continue
            norm = self.__bp.word_normalize(word)
            if self.__dict.get(norm):
                ans.append((word, self.__dict[norm]))
                continue
            stem_word = self.__stemmer.stemWord(word)
            if self.__dict.get(stem_word):
                ans.append((word, self.__dict[stem_word]))
                continue
            ans.append((word, 'unk'))
        return ans
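
A minimal usage sketch (the import path and the sample sentence are assumptions, not from the source; the static word list downloads on first use):

from sbnltk.Postag import static_postag   # module path assumed

tagger = static_postag()
# Returns one (word, tag) pair per token; words missing from the
# static table come back tagged 'unk', digits come back as 'NUM'.
print(tagger.tag('আমি বাংলায় গান গাই'))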
Example #2
class stemmerOP:
    # Rule/suffix-based stemmer backed by two word lists: surface words
    # harvested from the static NER data, plus a root-word dictionary.
    __wordtokens = wordTokenizer()
    __word_vec = []
    __word_dict = {}
    __word_dict2 = {}
    __bp = preprocessor()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('rootword_list', sbnltk_default.sbnltk_root_path + 'dataset/')
        self.__dl.download('ner_static', sbnltk_default.sbnltk_root_path + 'dataset/')
        # Each ner_static line is "word [word ...] TAG": keep every word,
        # drop the trailing tag.
        with open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt', 'r') as f:
            for line in f:
                words = line.replace('\n', '').split(' ')[:-1]
                for w in words:
                    self.__word_dict[w] = 1
        with open(sbnltk_default.sbnltk_root_path + 'dataset/rootword_list.txt', 'r') as f:
            for line in f:
                self.__word_dict2[line.replace('\n', '')] = 1

    def __search(self, word):
        # A candidate stem is accepted if it, or its normalized form,
        # appears in either word list.
        norm = self.__bp.word_normalize(word)
        return (norm in self.__word_dict) or (word in self.__word_dict) \
            or (word in self.__word_dict2) or (norm in self.__word_dict2)

    def __bnCompare(self, item1, item2):
        # Comparator that orders strings by descending length, so longer
        # suffixes are tried before shorter ones.
        return (len(item1) < len(item2)) - (len(item1) > len(item2))

    def stemWord(self, word):
        # rule_words / rule_dict are module-level suffix rules in sbnltk.
        try:
            if self.__word_dict2.get(word) is not None:
                return word
            # Collect every rule suffix that the word ends with.
            suf_arr = [wd for wd in rule_words if re.search('.*' + wd + '$', word)]
            suf_arr = sorted(suf_arr, key=functools.cmp_to_key(self.__bnCompare))
            for i in suf_arr:
                if i in rule_dict:
                    # Rewrite the suffix according to the replacement rule.
                    new_word = word[:len(word) - len(i)] + rule_dict[i]
                    if self.__search(new_word):
                        return new_word
                # Otherwise just strip the suffix.
                new_word = word[:len(word) - len(i)]
                if len(new_word) == 0:
                    return word
                if self.__search(new_word):
                    return new_word
            return word
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL}ERROR 101: Error in stemming!! {sbnltk_default.bcolors.ENDC}")

    def stemSent(self, sent):
        tokens = self.__wordtokens.basic_tokenizer(sent)
        temp_tokens = [self.stemWord(i) for i in tokens]
        return ' '.join(temp_tokens)
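
A short usage sketch (import path and sample words are assumptions; when no suffix rule yields a known root, stemWord returns the input unchanged):

from sbnltk.Stemmer import stemmerOP   # module path assumed

stemmer = stemmerOP()
print(stemmer.stemWord('ছেলেটা'))             # single word -> its stem, or itself
print(stemmer.stemSent('ছেলেটা স্কুলে যায়'))    # stems each token, rejoins with spaces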
Example #3
class static_NER:
    # Dictionary-based NER: greedy longest-match lookup of multi-word spans
    # against a static phrase -> tag table, with a stemmed fallback.
    __ner_static_data = {}
    __bp = preprocessor()
    __stemmer = stemmerOP()
    __dl = downloader()

    def __init__(self):
        self.__dl.download('ner_static',
                           sbnltk_default.sbnltk_root_path + 'dataset/')
        # Each line is "word [word ...] TAG": the last field is the tag,
        # the rest is the (possibly multi-word) entity.
        with open(sbnltk_default.sbnltk_root_path + 'dataset/ner_static.txt',
                  'r') as f:
            for line in f:
                segment = line.replace('\n', '').split(' ')
                tag = segment[-1]
                phrase = ' '.join(segment[:-1])
                self.__ner_static_data[phrase] = tag

    def tag(self, sentence):
        segment = sentence.split()
        stems = self.__stemmer.stemSent(sentence).split()
        i = 0
        sentence_tags = []
        while i < len(segment):
            # Try the longest span starting at i first, then shrink it.
            j = len(segment)
            flg = 0
            while j > i:
                now = ' '.join(segment[i:j])
                now2 = ' '.join(stems[i:j])
                if self.__ner_static_data.get(now) is not None:
                    sentence_tags.append((now, self.__ner_static_data[now]))
                    i = j - 1
                    flg = 1
                    break
                if self.__ner_static_data.get(now2) is not None:
                    sentence_tags.append((now, self.__ner_static_data[now2]))
                    i = j - 1
                    flg = 1
                    break  # was missing: the scan kept shrinking j after a match
                j -= 1
            if flg == 0:
                # No dictionary span matched: tag the single word as outside.
                sentence_tags.append((segment[i], 'O'))
            i += 1
        return sentence_tags
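
A hedged usage sketch (import path and sentence are assumptions; multi-word entities in the static table are matched greedily, longest span first):

from sbnltk.NER import static_NER   # module path assumed

ner = static_NER()
# Matched spans carry their table tag; everything else is tagged 'O'.
print(ner.tag('শেখ মুজিবুর রহমান ঢাকায় জন্মগ্রহণ করেন'))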
Example #4
class sklearn_NER:
    # NER tagger backed by a pickled scikit-learn model that predicts one
    # tag per token from hand-crafted word features.
    __dl = downloader()
    __bp = preprocessor()
    __sk_model = None

    def __init__(self):
        self.__dl.download('sklearn_ner',
                           sbnltk_default.sbnltk_root_path + 'model/')
        with open(sbnltk_default.sbnltk_root_path + 'model/sklearn_ner.pkl',
                  'rb') as f:
            self.__sk_model = pickle.load(f)

    def word2features(self, sent, i):
        # Feature dict for the i-th word: the word itself, position flags,
        # case flags, character prefixes/suffixes, and its neighbours.
        return {
            'word': sent[i],
            'is_first': i == 0,
            'is_last': i == len(sent) - 1,
            'is_capitalized': sent[i][0].upper() == sent[i][0],
            'is_all_caps': sent[i].upper() == sent[i],
            'is_all_lower': sent[i].lower() == sent[i],
            'prefix-1': sent[i][0],
            'prefix-2': sent[i][:2],
            'prefix-3': sent[i][:3],
            'suffix-1': sent[i][-1],
            'suffix-2': sent[i][-2:],
            'suffix-3': sent[i][-3:],
            'prev_word': '' if i == 0 else sent[i - 1],
            'next_word': '' if i == len(sent) - 1 else sent[i + 1],
            'is_numeric': sent[i].isdigit()
        }

    def tag(self, text):
        if len(text) == 0:
            return []
        words = text.split()
        sentence_features = [
            self.word2features(words, i) for i in range(len(words))
        ]
        # The model expects a batch of sentences; take the predictions
        # for the first (and only) one.
        return list(zip(words,
                        self.__sk_model.predict([sentence_features])[0]))
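
A hedged usage sketch (import path and sample text are assumptions; the pickled model is fetched by the downloader on first construction):

from sbnltk.NER import sklearn_NER   # module path assumed

ner = sklearn_NER()
# One (word, predicted_tag) pair per whitespace-separated token.
print(ner.tag('ঢাকা বাংলাদেশের রাজধানী'))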
Example #5
def __init__(self):
    # Excerpted constructor: wires together the stemmer, preprocessor,
    # both tokenizers, and the sklearn POS tagger.
    self.__stemmer = stemmerOP()
    self.__pre = preprocessor()
    self.__tokenizer = wordTokenizer()
    self.__sentT = sentenceTokenizer()
    self.__posT = sklearn_postag()
Example #6
class sentenceTokenizer:

    __pre = preprocessor()

    def __split_on_danda(self, text):
        # Split text on the Bangla full stop (danda, U+0964 '।'); the danda
        # itself is dropped. This replaces the original per-character
        # unicode_escape comparison, which tested for the same code point.
        text = text.replace('\n', ' ')
        tokens = []
        s = ""
        for c in text:
            if c == '\u0964':
                tokens.append(s)
                s = ""
                continue
            s += c
        if len(s) > 0:
            tokens.append(s)
        return tokens

    def basic_tokenizer(self, text):
        # Only splits on the Bangla full stop.
        return self.__split_on_danda(text)

    def customized_tokenizer(self, text, punc=True, norm=False, dust=False):
        # Split into sentences, then optionally remove punctuation,
        # normalize, and remove dust characters from each one.
        tokens = self.__split_on_danda(text)
        try:
            temp_tokens = []
            for i in tokens:
                if punc:
                    i = bp.punctuation_remove(i)
                if norm:
                    i = bp.word_normalize(i)
                if dust and len(bp.dust_removal_sent(i)) != 0:
                    i = bp.dust_removal_sent(i)
                temp_tokens.append(i)
            return temp_tokens
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 302: Error in Customized Sentence Tokenizer!! {sbnltk_default.bcolors.ENDC}")
            return tokens

    def sentence_vector_to_text(self, sentences, full_stop=True):
        # Join a list of sentences back into text, either with dandas or
        # with single spaces.
        if full_stop:
            text = sbnltk_default.bangla_full_stop.join(sentences)
            text += sbnltk_default.bangla_full_stop
        else:
            text = ' '.join(sentences)
            text = self.__pre.extra_space_remove(text)
        return text

    def sentence_cluster(self, text, max_length=100, punc=True, norm=False, dust=False):
        # Split on dandas, then break any sentence longer than max_length
        # characters into word-aligned chunks, applying the same optional
        # cleanup as customized_tokenizer.
        tokens = self.__split_on_danda(text)
        try:
            tmp_tokens = []
            for sent in tokens:
                if len(sent) > max_length:
                    word_tokens = sent.split()
                    tmp_sent = ''
                    for w in word_tokens:
                        if len(tmp_sent + w) > max_length:
                            # Flush the current chunk, minus its trailing space.
                            tmp_tokens.append(tmp_sent[:-1])
                            tmp_sent = ''
                        tmp_sent = tmp_sent + w + ' '
                    tmp_tokens.append(tmp_sent)
                else:
                    tmp_tokens.append(sent)
            tmp_tokens2 = []
            for sent in tmp_tokens:
                if punc:
                    sent = bp.punctuation_remove(sent)
                if norm:
                    sent = bp.word_normalize(sent)
                if dust and len(bp.dust_removal_sent(sent)) != 0:
                    sent = bp.dust_removal_sent(sent)
                tmp_tokens2.append(sent)
            return tmp_tokens2
        except Exception:
            print(f"{sbnltk_default.bcolors.FAIL} ERROR 303: Error in Sentence clustering!! {sbnltk_default.bcolors.ENDC}")
            return tokens
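
A hedged usage sketch (import path and sample text are assumptions; sentences are split on the danda '।'):

from sbnltk.Tokenizer import sentenceTokenizer   # module path assumed

st = sentenceTokenizer()
text = 'আমি ভাত খাই। সে স্কুলে যায়।'
print(st.basic_tokenizer(text))                   # one string per sentence
print(st.sentence_cluster(text, max_length=50))   # long sentences chunked at word boundaries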
Example #7
'''
WordTokenizer
    basic_tokenizer: only splits words on spaces
    customized_tokenizer: splits words; can remove punctuation (default True), apply normalization (default False), and remove dust (default False)

SentenceTokenizer
    basic_tokenizer: only splits on the Bangla full stop
    customized_tokenizer: splits sentences; can remove punctuation (default True), apply normalization (default False), and remove dust (default False)
    sentence_cluster: clusters sentences up to max_length; can remove punctuation (default True), apply normalization (default False), and remove dust (default False)
'''
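
# Illustrative usage of the API documented above (sample text and outputs
# are assumptions, not taken from the library):
#   wt = wordTokenizer()
#   wt.basic_tokenizer('আমি ভাত খাই')          # -> ['আমি', 'ভাত', 'খাই']
#   wt.customized_tokenizer('আমি, ভাত খাই!')   # punctuation removed by default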



from sbnltk.Preprocessor import preprocessor
from sbnltk import sbnltk_default
bp = preprocessor()


class wordTokenizer:
    def basic_tokenizer(self, text):
        # Whitespace-only word split.
        tokens = text.split()
        return tokens

    def customized_tokenizer(self, text, punc=True, norm=False, dust=False):
        if punc:
            text = bp.punctuation_remove(text)
        if norm:
            text = bp.word_normalize(text)
        tokens = text.split()
        try:
            if dust:
                temp_tokens = []
                for i in tokens: