from pythainlp import pos_tag
from pythainlp.tokenize import word_tokenize


def sentence_similarity(sentence1, sentence2):
    """Compute the similarity of two sentences using WordNet."""
    # Tokenize and POS-tag both sentences
    sentence1 = pos_tag(word_tokenize(sentence1), engine="artagger")
    sentence2 = pos_tag(word_tokenize(sentence2), engine="artagger")

    # Get the synsets for the tagged words
    # (tagged_to_synset() is not defined here; see the sketch below)
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]

    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0

    # For each word in the first sentence, take the similarity of the most
    # similar word in the other sentence. path_similarity() can return None,
    # so drop those values before taking the max (max() over a list that
    # contains None raises TypeError on Python 3).
    for synset in synsets1:
        scores = [s for s in (synset.path_similarity(ss) for ss in synsets2)
                  if s is not None]
        best_score = max(scores, default=None)

        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1

    # Average the values
    if count != 0:
        score /= count
    return score
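# sentence_similarity() depends on a tagged_to_synset() helper that is not
# shown in this snippet. A minimal sketch of one possible implementation,
# assuming ORCHID-style tags (N* = noun, V* = verb) and NLTK's Open
# Multilingual WordNet with Thai lemmas (lang="tha"); the tag mapping and
# language choice are assumptions, not the original author's code.
from nltk.corpus import wordnet as wn


def tagged_to_synset(word, tag):
    """Return the first matching synset for a tagged word, or None."""
    # Map the coarse POS prefix to a WordNet POS constant (assumed mapping)
    if tag.startswith("N"):
        wn_pos = wn.NOUN
    elif tag.startswith("V"):
        wn_pos = wn.VERB
    else:
        return None
    try:
        # Requires the NLTK "wordnet" and "omw-1.4" data packages
        return wn.synsets(word, pos=wn_pos, lang="tha")[0]
    except IndexError:
        # No synset for this word/POS combination
        return None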
def change_word_tokenize(self, name):
    """Re-tokenize the text with the word-tokenizer engine `name`."""
    if self.dictlist == []:
        self.words = WordList(
            pythainlp.word_tokenize(self.text, engine=name))
    else:
        # Tokenize against the custom dictionary while still honoring
        # the requested engine
        self.words = WordList(pythainlp.word_tokenize(
            self.text, self.dict, engine=name))
    self.tags = pythainlp.pos_tag(self.words)
def __init__(self, text, dictlist=None):
    self.text = text
    # Avoid a mutable default argument; None means "no custom dictionary"
    self.dictlist = dictlist if dictlist is not None else []
    if self.dictlist == []:
        self.words = WordList(pythainlp.word_tokenize(self.text))
    else:
        # Build a trie from the custom dictionary and tokenize against it
        self.dict = pythainlp.tokenize.dict_trie(self.dictlist)
        self.words = WordList(pythainlp.word_tokenize(
            self.text, self.dict))
    self.tags = pythainlp.pos_tag(self.words)
    self.romanize = [romanize_pythainlp(i) for i in self.words]
    self.word_counts = Counter(self.words)
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part-of-speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent), engine="artagger"):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip("_") if self.strip else token
            token = token.strip("*") if self.strip else token

            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield (see the sketch below)
            lemma = self.lemmatize(token, tag)
            yield lemma
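# tokenize() calls a self.lemmatize() method that is not shown in this
# snippet. A minimal sketch of what it could look like, assuming NLTK's
# WordNetLemmatizer; the lemmatizer choice and the tag-to-POS mapping are
# assumptions for illustration only.
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()


def lemmatize(self, token, tag):
    """Lemmatize a token using the first letter of its POS tag."""
    # Fall back to noun when the tag prefix is not recognized
    wn_pos = {"N": wn.NOUN, "V": wn.VERB, "R": wn.ADV, "J": wn.ADJ}.get(
        tag[:1], wn.NOUN)
    return _lemmatizer.lemmatize(token, wn_pos)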
def change_pos_tag(self, name):
    """Re-tag the current word list with the POS-tagger engine `name`."""
    self.tags = pythainlp.pos_tag(self.words, engine=name)
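# __init__(), change_word_tokenize(), and change_pos_tag() above belong to a
# blob-style wrapper class whose name is not shown in these snippets. A
# hypothetical usage sketch, assuming the class is called ThaiText:
blob = ThaiText("ผมชื่อต้นตาล")       # tokenize with the default engine
print(blob.words, blob.tags)           # word list and its POS tags
blob.change_word_tokenize("longest")   # re-tokenize with the "longest" engine
blob.change_pos_tag("perceptron")      # re-tag with the "perceptron" engine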
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp import pos_tag

# "My name is Tontan. I am 40 years old. I play games."
text = "ผมชื่อต้นตาล ผมอายุ 40 ปี ผมเล่นเกม"

sent = sent_tokenize(text)
print("Number of sentences: {}".format(len(sent)))

for i, s in enumerate(sent, start=1):
    print("Sentence {} is '{}'".format(
        i, pos_tag(word_tokenize(s), corpus="orchid_ud")))
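# Note: sent_tokenize() also accepts an engine argument. For the
# space-separated sample text above, the plain whitespace splitter is the
# most predictable choice (which engines are available depends on the
# installed pythainlp version):
sent = sent_tokenize(text, engine="whitespace+newline")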