def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Builds a new Sentence in which every word that has no morphological
    analysis is replaced by a randomly chosen candidate.

    Each word of the input is analysed with self.fsm. A word that yields at
    least one parse is copied unchanged. A word that yields no parse is passed
    to candidateList; if that list is non-empty, one entry is picked with
    randrange and wrapped in a new Word, otherwise the original word is kept.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        A new Sentence holding the (possibly replaced) words in order.
    """
    output = Sentence()
    for index in range(sentence.wordCount()):
        current = sentence.getWord(index)
        replacement = current
        # Only words the analyser cannot parse are candidates for replacement.
        if self.fsm.morphologicalAnalysis(current.getName()).size() == 0:
            alternatives = self.candidateList(current)
            if len(alternatives) > 0:
                replacement = Word(alternatives[randrange(len(alternatives))])
        output.addWord(replacement)
    return output
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Returns a copy of the given Sentence where each word that cannot be
    morphologically analysed is swapped for a random spelling candidate.

    For every word, self.fsm.morphologicalAnalysis is called with the word's
    name. When the resulting parse list is empty, candidateList is consulted;
    a random element of a non-empty candidate list becomes the new Word. In
    every other case the original word object is reused.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    checked = Sentence()
    position = 0
    total = sentence.wordCount()
    while position < total:
        original = sentence.getWord(position)
        parses = self.fsm.morphologicalAnalysis(original.getName())
        chosen = original
        if parses.size() == 0:
            options = self.candidateList(original)
            if len(options) > 0:
                pick = randrange(len(options))
                chosen = Word(options[pick])
        checked.addWord(chosen)
        position += 1
    return checked
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Corrects a Sentence word by word, choosing among spelling candidates with
    the n-gram model.

    A word that has at least one morphological parse is copied unchanged and
    the root of its longest-root parse becomes the previous root. A word with
    no parse is replaced by the candidate (from candidateList) whose root has
    the highest n-gram probability: the probability of (previous root, root)
    when a previous root exists, otherwise the probability of the root alone.
    Only probabilities strictly greater than 0.0 can win; if none does, the
    original word's name is kept and the word itself becomes the previous
    root.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    lastRoot = None
    corrected = Sentence()
    for position in range(sentence.wordCount()):
        token = sentence.getWord(position)
        analyses = self.fsm.morphologicalAnalysis(token.getName())
        if analyses.size() != 0:
            # Known word: keep it and remember its root as n-gram context.
            corrected.addWord(token)
            lastRoot = analyses.getParseWithLongestRootWord().getWord()
            continue
        options = self.candidateList(token)
        chosenName = token.getName()
        chosenRoot = token
        topScore = 0.0
        for option in options:
            optionParses = self.fsm.morphologicalAnalysis(option)
            optionRoot = optionParses.getParseWithLongestRootWord().getWord()
            if lastRoot is None:
                score = self.__nGram.getProbability(optionRoot.getName())
            else:
                score = self.__nGram.getProbability(lastRoot.getName(),
                                                    optionRoot.getName())
            if score > topScore:
                chosenName, chosenRoot, topScore = option, optionRoot, score
        lastRoot = chosenRoot
        corrected.addWord(Word(chosenName))
    return corrected
def asciify(self, sentence: Sentence) -> Sentence:
    """
    Produces an asciified copy of the given Sentence.

    Every word of the input is passed to asciifyWord and the returned string
    is wrapped in a new Word, which is appended to the result in the original
    order.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence output which is asciified.
    """
    asciified = Sentence()
    position = 0
    total = sentence.wordCount()
    while position < total:
        asciified.addWord(Word(self.asciifyWord(sentence.getWord(position))))
        position += 1
    return asciified
def __init__(self, fileName=None):
    """
    Initialises the base class and, when a file name is given, loads the
    corpus from that file.

    The file is read as UTF-8, one token per line in the form
    ``word<TAB>parse``. Lines whose word or parse part is empty are ignored.
    The word ``<S>`` starts a new sentence, ``</S>`` adds the current
    sentence to the corpus, and ``<DOC>``, ``</DOC>``, ``<TITLE>`` and
    ``</TITLE>`` are skipped. Every other line is added to the current
    sentence as a DisambiguatedWord built from the word and its stripped
    parse.

    PARAMETERS
    ----------
    fileName : str
        Path of the corpus file; when None nothing is loaded.

    RAISES
    ------
    ValueError
        If a line of the file contains no tab character.
    """
    super().__init__()
    if fileName is None:
        return
    # The context manager guarantees the file is closed even when a malformed
    # line or a failing constructor raises in the middle of the loop.
    with open(fileName, "r", encoding="utf8") as inputFile:
        newSentence = Sentence()
        for line in inputFile:
            tabIndex = line.index("\t")
            word = line[:tabIndex]
            parse = line[tabIndex + 1:]
            if len(word) > 0 and len(parse) > 0:
                newWord = DisambiguatedWord(word, MorphologicalParse(parse.strip()))
                if word == "<S>":
                    newSentence = Sentence()
                elif word == "</S>":
                    self.addSentence(newSentence)
                elif word in ("<DOC>", "</DOC>", "<TITLE>", "</TITLE>"):
                    # Document/title markers carry no token.
                    pass
                else:
                    newSentence.addWord(newWord)
def split(self, line: str) -> list:
    """
    Splits a line of text into a list of sentences.

    The line is scanned one character at a time. Characters are collected in
    currentWord, finished words are appended to currentSentence, and finished
    sentences are appended to the returned list. Counters for open brackets
    and quotes are maintained so that a sentence is not closed while a
    parenthesis, bracket, curly bracket or double quote is still open.

    For every character the first matching rule is applied:

    1. Separator (member of SentenceSplitter.SEPARATORS)
       - An apostrophe that follows a non-empty word and is accepted by
         __isApostrophe is kept as part of the word.
       - Otherwise the pending word (passed through __repeatControl) and the
         separator itself are added as words and the matching counter is
         updated: { } curly, U+FF02 / U+05F4 special quote, ( ) round,
         [ ] square; a double quote and an apostrophe toggle quotaCount and
         apostropheCount between 0 and 1.
       - A double quote seen while the bracket, special-quote and
         double-quote counters are all 0 closes the sentence when
         __isNextCharUpperCaseOrDigit(line, i + 1) holds.
    2. Sentence ender (member of SentenceSplitter.SENTENCE_ENDERS)
       - '.' directly after the word "www" turns on web mode.
       - '.' after a non-empty word is appended to the word in web or e-mail
         mode, or when the previous character is in TurkishLanguage.DIGITS
         (Ex: 1.).
       - '.' after a word accepted by __listContains or __isNameShortcut is
         appended and the word is emitted (Ex: bkz.).
       - Otherwise the pending word is emitted, the whole run of consecutive
         sentence enders is consumed with only the first one emitted as a
         word, and, when no round/square/curly bracket or double quote is
         open, the sentence is closed if an apostrophe (optionally preceded
         by one space) closes an open apostrophe pair and is followed by what
         __isNextCharUpperCaseOrDigit accepts, or if the next character
         itself is accepted by __isNextCharUpperCaseOrDigit.
    3. Space: web and e-mail modes are switched off and the pending word is
       emitted.
    4. '-' outside web mode and outside round parentheses, for which
       __isNextCharUpperCase(line, i + 1) holds and
       __isPreviousWordUpperCase(line, i - 1) does not: the current sentence
       is closed, all counters except apostropheCount are reset, and a new
       sentence is started with "-" (or with the pending word followed by
       " -" when the pending word begins with digits).
    5. Punctuation or arithmetic character
       - ':' after "http" or "https" turns on web mode.
       - In web mode the character is appended to the word.
       - ',' or '-' accepted by __numberExistsBeforeAndAfter, and ':'
         accepted by __isTime, are appended to the word
         (Ex: 1,2  12:14:24  12-1).
       - Otherwise the pending word and the character are emitted as two
         separate words.
    6. Anything else is appended to the word; '@' additionally turns on
       e-mail mode.

    After the scan, a pending word and a non-empty pending sentence are
    flushed.

    PARAMETERS
    ----------
    line : str
        String input to split.

    RETURNS
    -------
    list
        List of Sentence objects in the order they were closed.
    """
    # Scanner state: mode flags, cursor, open-bracket/quote counters and the
    # word/sentence currently being assembled.
    emailMode = False
    webMode = False
    i = 0
    specialQuotaCount = 0
    roundParenthesisCount = 0
    bracketCount = 0
    curlyBracketCount = 0
    quotaCount = 0
    apostropheCount = 0
    currentSentence = Sentence()
    currentWord = ""
    sentences = []
    while i < len(line):
        if line[i] in SentenceSplitter.SEPARATORS:
            if line[i] == '\'' and currentWord != "" and self.__isApostrophe(
                    line, i):
                # Apostrophe belongs to the word being built.
                currentWord = currentWord + line[i]
            else:
                if currentWord != "":
                    currentSentence.addWord(
                        Word(
                            self.__repeatControl(currentWord, webMode
                                                 or emailMode)))
                # The separator itself becomes a one-character word.
                currentSentence.addWord(Word("" + line[i]))
                currentWord = ""
                if line[i] == '{':
                    curlyBracketCount = curlyBracketCount + 1
                elif line[i] == '}':
                    curlyBracketCount = curlyBracketCount - 1
                elif line[i] == '\uFF02':
                    specialQuotaCount = specialQuotaCount + 1
                elif line[i] == '\u05F4':
                    specialQuotaCount = specialQuotaCount - 1
                elif line[i] == '(':
                    roundParenthesisCount = roundParenthesisCount + 1
                elif line[i] == ')':
                    roundParenthesisCount = roundParenthesisCount - 1
                elif line[i] == '[':
                    bracketCount = bracketCount + 1
                elif line[i] == ']':
                    bracketCount = bracketCount - 1
                elif line[i] == '"':
                    # Toggles between 0 (closed) and 1 (open).
                    quotaCount = 1 - quotaCount
                elif line[i] == '\'':
                    apostropheCount = 1 - apostropheCount
                # A double quote that leaves nothing open may end the
                # sentence; quotaCount == 0 here means it was a closing quote.
                if line[i] == '"' and bracketCount == 0 and specialQuotaCount == 0 and curlyBracketCount == 0 and \
                        roundParenthesisCount == 0 and quotaCount == 0 and self.__isNextCharUpperCaseOrDigit(line, i + 1):
                    sentences.append(currentSentence)
                    currentSentence = Sentence()
        else:
            if line[i] in SentenceSplitter.SENTENCE_ENDERS:
                if line[i] == '.' and currentWord == "www":
                    webMode = True
                # currentWord != "" implies i >= 1, so line[i - 1] is the
                # character just before the dot.
                if line[i] == '.' and currentWord != "" and (
                        webMode or emailMode
                        or line[i - 1] in TurkishLanguage.DIGITS):
                    currentWord = currentWord + line[i]
                else:
                    if line[i] == '.' and (
                            self.__listContains(currentWord)
                            or self.__isNameShortcut(currentWord)):
                        # The dot is part of the token; emit it without
                        # ending the sentence.
                        currentWord = currentWord + line[i]
                        currentSentence.addWord(Word(currentWord))
                        currentWord = ""
                    else:
                        if currentWord != "":
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        currentWord = "" + line[i]
                        # Skip the rest of a run of sentence enders, then
                        # step back so i points at the last one of the run.
                        i = i + 1
                        while i < len(line) and line[
                                i] in SentenceSplitter.SENTENCE_ENDERS:
                            i = i + 1
                        i = i - 1
                        currentSentence.addWord(Word(currentWord))
                        if roundParenthesisCount == 0 and bracketCount == 0 and curlyBracketCount == 0 and \
                                quotaCount == 0:
                            # Closing apostrophe directly after the ender.
                            if i + 1 < len(line) and line[i + 1] == '\'' and apostropheCount == 1 and \
                                    self.__isNextCharUpperCaseOrDigit(line, i + 2):
                                currentSentence.addWord(Word("'"))
                                i = i + 1
                                sentences.append(currentSentence)
                                currentSentence = Sentence()
                            else:
                                # Closing apostrophe after one space.
                                if i + 2 < len(line) and line[i + 1] == ' ' and line[i + 2] == '\'' and \
                                        apostropheCount == 1 and self.__isNextCharUpperCaseOrDigit(line, i + 3):
                                    currentSentence.addWord(Word("'"))
                                    i += 2
                                    sentences.append(currentSentence)
                                    currentSentence = Sentence()
                                else:
                                    if self.__isNextCharUpperCaseOrDigit(
                                            line, i + 1):
                                        sentences.append(currentSentence)
                                        currentSentence = Sentence()
                        currentWord = ""
            else:
                if line[i] == ' ':
                    emailMode = False
                    webMode = False
                    if currentWord != "":
                        # NOTE(review): both flags were cleared just above,
                        # so __repeatControl always receives False here, even
                        # for a word collected in web/e-mail mode. Confirm
                        # this is intended.
                        currentSentence.addWord(
                            Word(
                                self.__repeatControl(
                                    currentWord, webMode or emailMode)))
                        currentWord = ""
                else:
                    if line[i] == '-' and not webMode and roundParenthesisCount == 0 and \
                            self.__isNextCharUpperCase(line, i + 1) and \
                            not self.__isPreviousWordUpperCase(line, i - 1):
                        if currentWord != "" and currentWord not in TurkishLanguage.DIGITS:
                            currentSentence.addWord(
                                Word(
                                    self.__repeatControl(
                                        currentWord, webMode or emailMode)))
                        if currentSentence.wordCount() > 0:
                            sentences.append(currentSentence)
                        # Start a fresh sentence; apostropheCount is the only
                        # counter that is not reset.
                        currentSentence = Sentence()
                        roundParenthesisCount = 0
                        bracketCount = 0
                        curlyBracketCount = 0
                        quotaCount = 0
                        specialQuotaCount = 0
                        # NOTE(review): re.match only anchors at the start,
                        # so any word beginning with a digit qualifies, not
                        # only all-digit words.
                        if currentWord != "" and re.match(
                                "\\d+", currentWord):
                            currentSentence.addWord(
                                Word(currentWord + " -"))
                        else:
                            currentSentence.addWord(Word("-"))
                        currentWord = ""
                    else:
                        if line[i] in SentenceSplitter.PUNCTUATION_CHARACTERS or \
                                line[i] in TurkishLanguage.ARITHMETIC_CHARACTERS:
                            if line[i] == ':' and (currentWord == "http" or currentWord == "https"):
                                webMode = True
                            if webMode:
                                # Inside a URL every punctuation mark stays
                                # in the word.
                                currentWord = currentWord + line[i]
                            else:
                                if line[i] == ',' and self.__numberExistsBeforeAndAfter(
                                        line, i):
                                    currentWord = currentWord + line[i]
                                else:
                                    if line[i] == ':' and self.__isTime(
                                            line, i):
                                        currentWord = currentWord + line[i]
                                    else:
                                        if line[i] == '-' and self.__numberExistsBeforeAndAfter(
                                                line, i):
                                            currentWord = currentWord + line[
                                                i]
                                        else:
                                            # Ordinary punctuation: emit the
                                            # pending word and the mark as
                                            # separate words.
                                            if currentWord != "":
                                                currentSentence.addWord(
                                                    Word(
                                                        self.
                                                        __repeatControl(
                                                            currentWord,
                                                            webMode
                                                            or emailMode)))
                                            currentSentence.addWord(
                                                Word("" + line[i]))
                                            currentWord = ""
                        else:
                            if line[i] == '@':
                                currentWord = currentWord + line[i]
                                emailMode = True
                            else:
                                currentWord = currentWord + line[i]
        i = i + 1
    # Flush whatever is left after the last character.
    if currentWord != "":
        currentSentence.addWord(
            Word(self.__repeatControl(currentWord, webMode or emailMode)))
    if currentSentence.wordCount() > 0:
        sentences.append(currentSentence)
    return sentences
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Deasciifies a Sentence using the n-gram model with both the preceding and
    the following root as context.

    checkAnalysisAndSetRoot supplies the root for the current and the next
    position. A word whose root is not None is copied unchanged. For a word
    whose root is None, every entry of candidateList is scored: its root is
    the longest-root parse when self.__rootNgram is set, otherwise a Word
    built from the candidate itself; its score is the larger of the n-gram
    probability with the preceding root and with the following root (0.0 for
    a missing neighbour). The candidate with the highest score strictly above
    self.__threshold replaces the word; if none qualifies the original name
    is kept and the word itself is used as the root.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result as output.
    """
    precedingRoot = None
    output = Sentence()
    currentRoot = self.checkAnalysisAndSetRoot(sentence, 0)
    followingRoot = self.checkAnalysisAndSetRoot(sentence, 1)
    for position in range(sentence.wordCount()):
        token = sentence.getWord(position)
        if currentRoot is not None:
            output.addWord(token)
        else:
            options = self.candidateList(token)
            chosenName = token.getName()
            chosenRoot = token
            topScore = self.__threshold
            for option in options:
                analyses = self.fsm.morphologicalAnalysis(option)
                if self.__rootNgram:
                    optionRoot = analyses.getParseWithLongestRootWord().getWord()
                else:
                    optionRoot = Word(option)
                backwardScore = 0.0
                if precedingRoot is not None:
                    backwardScore = self.__nGram.getProbability(
                        precedingRoot.getName(), optionRoot.getName())
                forwardScore = 0.0
                if followingRoot is not None:
                    forwardScore = self.__nGram.getProbability(
                        optionRoot.getName(), followingRoot.getName())
                score = max(backwardScore, forwardScore)
                if score > topScore:
                    chosenName, chosenRoot, topScore = option, optionRoot, score
            currentRoot = chosenRoot
            output.addWord(Word(chosenName))
        # Slide the (preceding, current, following) root window one word on.
        precedingRoot = currentRoot
        currentRoot = followingRoot
        followingRoot = self.checkAnalysisAndSetRoot(sentence, position + 2)
    return output
class SentenceTest(unittest.TestCase):
    """Unit tests for word access, lookup and counting on Sentence."""

    sentence: Sentence

    def setUp(self) -> None:
        """Builds the six-word fixture sentence used by every test."""
        self.sentence = Sentence()
        for name in ("ali", "topu", "at", "mehmet", "ayşeyle", "gitti"):
            self.sentence.addWord(Word(name))

    def test_GetWord(self):
        """getWord returns the word stored at the given position."""
        for position, name in ((0, "ali"), (2, "at"), (5, "gitti")):
            self.assertEqual(Word(name), self.sentence.getWord(position))

    def test_GetIndex(self):
        """getIndex returns the position of the given word."""
        for position, name in ((0, "ali"), (2, "at"), (5, "gitti")):
            self.assertEqual(position, self.sentence.getIndex(Word(name)))

    def test_WordCount(self):
        """wordCount reports the number of words added in setUp."""
        expected = 6
        self.assertEqual(expected, self.sentence.wordCount())

    def test_CharCount(self):
        """charCount reports 27 for the fixture sentence."""
        expected = 27
        self.assertEqual(expected, self.sentence.charCount())