def tokenize_characters(self, document):
    """Build a character-level vocabulary and return a 1-based character -> id map."""
    vocab = set()

    # Accept either a list of texts or a single text.
    texts = document if isinstance(document, list) else [document]

    for text in texts:
        char_sequence = self.text2characters(text)
        tokens_indic = pd.Series(trivial_tokenize_indic(char_sequence))
        word_counts = tokens_indic.value_counts()
        vocab = vocab.union(set(word_counts.keys()))

    print('Total Unique Tokens (Characters): {}'.format(len(vocab)))

    # Assign 1-based integer ids to the characters.
    token_dict = {char: cnt for cnt, char in enumerate(vocab, start=1)}

    return token_dict
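
# A self-contained sketch of the character-vocabulary idea in tokenize_characters,
# assuming only indic-nlp-library; the space-joining below is an assumed stand-in
# for text2characters, and char_vocab is a hypothetical helper name:
from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic

def char_vocab(texts):
    vocab = set()
    for text in texts:
        char_sequence = ' '.join(text.replace(' ', ''))  # assumed text2characters equivalent
        vocab |= set(trivial_tokenize_indic(char_sequence))  # unique character tokens
    # Sorted here only so the 1-based ids are reproducible across runs.
    return {ch: i for i, ch in enumerate(sorted(vocab), start=1)}

print(char_vocab(['ગુજરાતી ભાષા', 'ગુજરાત']))
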
def guj_stopwordremoval(originalfile, resultfile):
    """Remove stopwords from the file named by originalfile; write the result to resultfile."""
    import io
    from indicnlp.tokenize import indic_tokenize

    # Static stopword list.
    stoplist = [
        'હતાં', 'એમ', 'છે', 'છો', 'છુ', 'હતા', 'હતું', 'હતી', 'હોય', 'હતો',
        'તેમાં', 'અને', 'તથા', 'તો', 'છું'
    ]
    lines = guj_sentence_segmenter(originalfile)
    with io.open(resultfile, 'w+', encoding='utf-8') as w:
        for line in lines:
            newline = ''
            # All tokens in the original sentence, and the subset that are words.
            alltokens = indic_tokenize.trivial_tokenize_indic(line)
            wordtokens = guj_mytokenizer(line)
            for tok in alltokens:
                if tok in wordtokens:
                    # Word token: keep it only if it is not a stopword.
                    if tok not in stoplist:
                        newline = newline + tok + " "
                else:
                    # Non-word token (punctuation etc.): keep as-is.
                    newline = newline + tok + " "
            w.write(newline + "\n")  # "\n" keeps each sentence on its own line in the output file
    # guj_clean_extra(resultfile, resultfile)
    return resultfile
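
# A minimal in-memory sketch of the same stopword filter, assuming only
# indic-nlp-library (the file I/O and the guj_mytokenizer word check are omitted):
from indicnlp.tokenize import indic_tokenize

def remove_stopwords(line, stoplist):
    tokens = indic_tokenize.trivial_tokenize_indic(line)
    # Keep every token that is not a stopword; punctuation is never in the stoplist.
    return ' '.join(tok for tok in tokens if tok not in stoplist)

# e.g. remove_stopwords('આ પુસ્તક સારું છે .', ['છે']) drops the stopword 'છે'.
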
def guj_stem_to_lemma(datafile, stemmedlist, lemmalist):
    """Map stemmed words back to their lemma form in the vocabulary found."""
    import io
    from indicnlp.tokenize import indic_tokenize
    stemtolemma = 'lemmatized.csv'
    mycorpora, totalsent = guj_corpus_generate(datafile)

    mappings = {}
    with io.open(stemtolemma, 'a+', encoding='utf8') as wr:
        # wr.write("stemmed word , lemmaword(replacement) \n")
        # If the stemmed version of a lemma appears in our stemmed list, record the replacement.
        for lemma in lemmalist:
            sword = guj_stem_withoutdict(lemma)
            if sword in stemmedlist and sword != lemma:
                mappings[sword] = lemma
                wr.write("{0},{1}\n".format(sword, lemma))

    # Store the lemmatized sentences back in the file.
    with io.open(datafile, 'w+', encoding='utf-8') as w:
        for line in mycorpora:
            alltokens = indic_tokenize.trivial_tokenize_indic(line)  # all tokens in the sentence
            wordtokens = guj_mytokenizer(line)  # only the word tokens
            newline = ''  # line to be formed after replacement
            for basictoken in alltokens:
                if basictoken in wordtokens and basictoken in mappings:
                    # The token is a word that was trimmed, so replace it with its lemma;
                    # this replaces only whole tokens, never substrings of a word.
                    newline = newline + mappings[basictoken] + " "
                else:
                    newline = newline + basictoken + " "
            w.write(newline + "\n")  # write the modified line to the result file
    return datafile
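
# The rewrite pass in guj_stem_to_lemma is a token-wise dictionary substitution;
# a standalone sketch of just that step (apply_token_mappings is a hypothetical name):
from indicnlp.tokenize import indic_tokenize

def apply_token_mappings(line, mappings):
    tokens = indic_tokenize.trivial_tokenize_indic(line)
    # Replace a token when a mapping exists for it, otherwise keep it unchanged.
    return ' '.join(mappings.get(tok, tok) for tok in tokens)
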
def guj_stemmer_new(originalfile, resultfile, dictionary, focus_list):
    """Stem a file using the dictionary; the focus list stops stemming of proper names."""
    import io
    from indicnlp.tokenize import indic_tokenize

    stemmedresultfile = 'stemmingdata.csv'  # stores each original word with the stem found
    vocablist = guj_makevocab(originalfile)  # vocabulary list of words for the given file
    stemmedwordslist = []
    lemmawordslist = []
    replacedict = {}
    mycorpora = guj_sentence_segmenter(originalfile)  # sentences in the data file

    # Store each stemmed word together with its original word.
    with io.open(stemmedresultfile, 'a+', encoding='utf-8') as s:
        for word in vocablist:
            # Find the stem, passing the dictionary of words and the noun/named-entity focus list.
            sword, isdict = guj_stem_withdict(word, dictionary, focus_list)
            if isdict == 1:
                lemmawordslist.append(sword)  # a dictionary word was found, so it is a lemma form
            else:
                stemmedwordslist.append(sword)  # keep track of all stemmings done

            if sword != word:
                replacedict[word] = sword  # replace the original word with this stem word
                s.write("{0},{1}\n".format(word, sword))

    # Store the stemmed sentences back in a file.
    with io.open(resultfile, 'w+', encoding='utf-8') as w:
        for line in mycorpora:
            alltokens = indic_tokenize.trivial_tokenize_indic(line)  # all tokens in the sentence
            wordtokens = guj_mytokenizer(line)  # only the word tokens
            newline = ''  # line to be formed after replacement
            for basictoken in alltokens:
                if basictoken in wordtokens and basictoken in replacedict:
                    # The token is a word that was trimmed, so replace it with its stem.
                    newline = newline + replacedict[basictoken] + " "
                else:
                    newline = newline + basictoken + " "
            w.write(newline + "\n")  # write the modified line to the result file

    guj_clean_extra(resultfile, resultfile)
    return resultfile, stemmedwordslist, lemmawordslist
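
# How the two passes are meant to compose, as a hypothetical invocation (the file
# names are placeholders; dictionary and focus_list come from earlier cells):
# resultfile, stemmedwordslist, lemmawordslist = guj_stemmer_new(
#     'corpus.txt', 'corpus_stemmed.txt', dictionary, focus_list)
# guj_stem_to_lemma(resultfile, stemmedwordslist, lemmawordslist)
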
import tensorflow as tf

# char_level=True makes the Keras Tokenizer treat every character as a separate token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, split=" ")

tokenizer.fit_on_texts([char_sequence_1])  # fit_on_texts expects a list of texts

print(tokenizer.word_counts)
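
# With the fitted Tokenizer the same text maps to integer id sequences; a short
# follow-on sketch (char_sequence_1 is the character string prepared earlier):
print(tokenizer.word_index)  # character -> id map learned by fit_on_texts
sequences = tokenizer.texts_to_sequences([char_sequence_1])
print(sequences[0][:20])  # ids of the first 20 characters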


# <h4>Max Jugaad</h4>

# In[91]:


from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic
import pandas as pd

tokens_indic = trivial_tokenize_indic(char_sequence_1)

tokens_indic = pd.Series(tokens_indic)

word_counts = tokens_indic.value_counts()
print(word_counts)
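
# The value_counts index can be turned into the same kind of 1-based token -> id
# map that tokenize_characters builds (most frequent characters get the lowest ids):
char2id = {ch: i for i, ch in enumerate(word_counts.index, start=1)}
print(len(char2id), 'unique characters')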


# In[ ]:




def anaphoraresolution(corpora, actorlist, resultfile):
    """Take a corpus of lines and resolve pronoun references to actors from earlier lines."""
    import io
    from indicnlp.tokenize import indic_tokenize
    # Gujarati referencing words (pronoun forms and common spelling variants).
    reference_words = [
        'તે', 'તેને', 'તેણે', 'તેનો', 'તેની', 'તેનું', 'તેનુ', 'તેંનું',
        'એ', 'એની', 'ઍની', 'એનિ', 'એણી', 'ઍનિ', 'એનું', 'એનુ', 'ઍનુ', 'ઍનું', 'એનૂ',
        'તેમની', 'તેમનિ', 'તૅમની', 'તેમને', 'તેમણે', 'તેમનો', 'તેમનું', 'તેમનુ', 'તેમનૂ', 'તેમનુઁ',
        'હું'
    ]

    with io.open(resultfile, 'w', encoding='utf-8') as w:
        for (index, line) in enumerate(corpora):
            if index == 0:
                # First line: copy directly, there is no earlier line to refer back to.
                w.write(line + "\n")  # assumes segmenter lines carry no trailing newline
                continue

            if index == 1:
                # Second line: look for a referent in the first line and replace if needed.
                current, prev = corpora[index], corpora[index - 1]
                all_tokens = indic_tokenize.trivial_tokenize_indic(line)  # all tokens in line
                the_current_words = guj_mytokenizer(current)
                the_prev_words = guj_mytokenizer(prev)
                newline = ''
                for token in all_tokens:
                    if token in the_current_words and token in reference_words:
                        flag = 0  # whether a replacement actor has been found
                        for prevword in the_prev_words:
                            if prevword in actorlist:
                                # Use the actor word instead of the referencing word.
                                newline = newline + " " + prevword
                                flag = 1
                                break  # stop at the first actor found in the previous line
                        if flag != 1:
                            # No actor word found; copy the token unchanged.
                            newline = newline + " " + token
                    else:
                        # The token is not a referencing word, copy it unchanged.
                        newline = newline + " " + token
                w.write(newline + "\n")
                continue

            # Later lines: consider the current line and the previous two lines.
            current, prev, prevprev = corpora[index], corpora[index - 1], corpora[index - 2]
            all_tokens = indic_tokenize.trivial_tokenize_indic(line)  # all tokens in line
            the_current_words = guj_mytokenizer(current)
            the_prev_words = guj_mytokenizer(prev)
            the_prev_prev_words = guj_mytokenizer(prevprev)
            newline = ''
            for token in all_tokens:
                if token in the_current_words and token in reference_words:
                    flag = 0  # whether a replacement actor has been found
                    for prevword in the_prev_words:
                        if prevword in actorlist:
                            newline = newline + " " + prevword  # use the actor word instead
                            flag = 1
                            break
                    if flag != 1:
                        # Nothing found in the previous line; try the line before that.
                        for prevword in the_prev_prev_words:
                            if prevword in actorlist:
                                newline = newline + " " + prevword
                                flag = 1
                                break
                    if flag != 1:
                        # No actor word found; copy the token unchanged.
                        newline = newline + " " + token
                else:
                    # The token is not a referencing word, copy it unchanged.
                    newline = newline + " " + token
            w.write(newline + "\n")

    return resultfile
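
# A self-contained sketch of the substitution rule used above: a referencing word in
# the current sentence is replaced by the first actor found in the previous sentence
# (resolve_tokens is a hypothetical helper; the caller supplies both word lists):
def resolve_tokens(current_tokens, prev_words, actorlist, reference_words):
    out = []
    for tok in current_tokens:
        if tok in reference_words:
            # Take the first actor mentioned in the previous sentence, if any.
            actor = next((w for w in prev_words if w in actorlist), None)
            out.append(actor if actor is not None else tok)
        else:
            out.append(tok)
    return ' '.join(out)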