def test_ReduceToParsesWithSameRootAndPos(self):
    """Check that reduceToParsesWithSameRootAndPos keeps only parses sharing the given root+POS."""
    cases = [
        (self.parse2, "kop+VERB", 1),
        (self.parse3, "topla+VERB", 2),
        (self.parse6, "karşıla+VERB", 2),
    ]
    for parseList, rootWithPos, expectedSize in cases:
        parseList.reduceToParsesWithSameRootAndPos(Word(rootWithPos))
        self.assertEqual(expectedSize, parseList.size())
def __init__(self, fileOrStr=None, languageChecker: LanguageChecker = None):
    """
    Builds the word list of a Sentence either from a StringIO-like file
    object or from a raw string.

    PARAMETERS
    ----------
    fileOrStr
        Either an io.StringIO whose lines are read and split on single
        spaces, or a plain string split on single spaces. Any other
        value (including None) leaves the sentence empty.
    languageChecker : LanguageChecker
        Optional validator; consulted only in the string branch, where
        tokens failing isValidWord are skipped.
    """
    self.words = []
    if isinstance(fileOrStr, io.StringIO):
        # NOTE(review): unlike the string branch below, this branch keeps
        # empty tokens and tokens with a trailing newline — confirm
        # whether that asymmetry is intended.
        for line in fileOrStr.readlines():
            for token in line.split(" "):
                self.words.append(Word(token))
        fileOrStr.close()
    elif isinstance(fileOrStr, str):
        for token in fileOrStr.split(" "):
            if len(token) > 0 and (languageChecker is None or languageChecker.isValidWord(token)):
                self.words.append(Word(token))
def candidateList(self, word: Word) -> list:
    """
    Builds the list of spelling candidates for a word, starting from the
    word itself plus everything __generateCandidateList produces, and
    keeps only candidates the morphological analyzer can parse.

    PARAMETERS
    ----------
    word : Word
        Word type input.

    RETURNS
    -------
    list
        Candidates that have at least one morphological analysis.
    """
    candidates = [word.getName()]
    self.__generateCandidateList(candidates, word.getName(), 0)
    # Filtering comprehension replaces the original in-place pop loop
    # with its error-prone manual index decrement.
    return [candidate for candidate in candidates
            if self.fsm.morphologicalAnalysis(candidate).size() > 0]
def __init__(self, parse=None):
    """
    Parses a metamorphic analysis string of the form
    "root+morpheme1+morpheme2..." into a root Word and a list of meta
    morphemes.

    PARAMETERS
    ----------
    parse : str
        String to parse; the single string "+" denotes a punctuation
        root with no meta morphemes.
    """
    if parse is not None:
        self.__metaMorphemeList = []
        if parse == "+":
            self.__root = Word("+")
        else:
            # BUG FIX: str.split("\\+") splits on the literal two-character
            # text "\+" (a leftover Java regex), so the parse string was
            # never split and no meta morphemes were ever collected.
            words = parse.split("+")
            self.__root = Word(words[0])
            for i in range(1, len(words)):
                self.__metaMorphemeList.append(words[i])
def test_GetParseWithLongestRootWord(self):
    """Verify the root of the longest-root parse for several parse lists."""
    expectations = [
        (self.parse2, "kopar"),
        (self.parse3, "toplama"),
        (self.parse4, "değerlendirme"),
        (self.parse5, "soruşturma"),
        (self.parse6, "karşılaştırmalı"),
    ]
    for parseList, expectedRoot in expectations:
        self.assertEqual(Word(expectedRoot), parseList.getParseWithLongestRootWord().root)
def train(self, corpus: DisambiguationCorpus):
    """
    Trains word and inflectional-group (ig) n-gram models from a
    disambiguation corpus. For every DisambiguatedWord in every
    sentence, the word-with-POS is added to the word unigram model and
    its transition list (wrapped in a Word) to the ig unigram model;
    when a next word exists, the (current, next) pairs are added to the
    word and ig bigram models. All four models are then smoothed with
    LaplaceSmoothing.

    PARAMETERS
    ----------
    corpus : DisambiguationCorpus
        DisambiguationCorpus to train.
    """
    # Fixed-size scratch buffers reused for every addNGram call.
    # NOTE(review): this assumes NGram.addNGram copies its argument
    # rather than keeping a reference — confirm against the NGram API.
    words1 = [None]
    igs1 = [None]
    words2 = [None, None]
    igs2 = [None, None]
    self.wordUniGramModel = NGram(1)
    self.wordBiGramModel = NGram(2)
    self.igUniGramModel = NGram(1)
    self.igBiGramModel = NGram(2)
    for sentence in corpus.sentences:
        for j in range(sentence.wordCount()):
            word = sentence.getWord(j)
            # Words without a disambiguated parse are skipped entirely.
            if isinstance(word, DisambiguatedWord):
                words1[0] = word.getParse().getWordWithPos()
                self.wordUniGramModel.addNGram(words1)
                igs1[0] = Word(word.getParse().getTransitionList())
                self.igUniGramModel.addNGram(igs1)
                if j + 1 < sentence.wordCount():
                    # Bigrams pair the current word with its successor.
                    words2[0] = words1[0]
                    words2[1] = sentence.getWord(j + 1).getParse().getWordWithPos()
                    self.wordBiGramModel.addNGram(words2)
                    igs2[0] = igs1[0]
                    igs2[1] = Word(sentence.getWord(j + 1).getParse().getTransitionList())
                    self.igBiGramModel.addNGram(igs2)
    # All four models use Laplace smoothing (not interpolated smoothing).
    self.wordUniGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.igUniGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.wordBiGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.igBiGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
def __init__(self, parse=None):
    """
    Parses a morphological analysis — either a string such as
    "root+TAG1+TAG2^DB+TAG3..." (derivational boundaries marked with
    "^DB+", punctuation given as "++Punc") or a pre-split list whose
    first element is "root+TAGS..." followed by further IG strings —
    into a root Word and a list of InflectionalGroups.

    PARAMETERS
    ----------
    parse : str or list
        Analysis string or list of IG strings.
    """
    if parse is not None:
        if isinstance(parse, str):
            iGs = []
            st = parse
            while "^DB+" in st:
                iGs.append(st[:st.index("^DB+")])
                st = st[st.index("^DB+") + 4:]
            iGs.append(st)
            self.inflectionalGroups = []
            if iGs[0] == "++Punc":
                self.root = Word("+")
                self.inflectionalGroups.append(InflectionalGroup("Punc"))
            else:
                # BUG FIX: str.index raises ValueError when "+" is absent
                # (it never returns -1 like Java's indexOf), so the bare-root
                # else branch was unreachable and tagless roots crashed.
                if "+" in iGs[0]:
                    plusPos = iGs[0].index("+")
                    self.root = Word(iGs[0][:plusPos])
                    self.inflectionalGroups.append(InflectionalGroup(iGs[0][plusPos + 1:]))
                else:
                    self.root = Word(iGs[0])
                for i in range(1, len(iGs)):
                    self.inflectionalGroups.append(InflectionalGroup(iGs[i]))
        elif isinstance(parse, list):
            self.inflectionalGroups = []
            # Same indexOf-port fix as above; a tagless first element now
            # sets a bare root instead of raising ValueError.
            if "+" in parse[0]:
                plusPos = parse[0].index("+")
                self.root = Word(parse[0][:plusPos])
                self.inflectionalGroups.append(InflectionalGroup(parse[0][plusPos + 1:]))
            else:
                self.root = Word(parse[0])
            for i in range(1, len(parse)):
                self.inflectionalGroups.append(InflectionalGroup(parse[i]))
def __init__(self, parse=None):
    """
    Constructor of MorphologicalParse which takes either a String parse
    or a list of inflectional-group strings.

    For a string, it repeatedly splits on the derivational boundary
    "^DB+", collecting the pieces into iGs. If the first piece is
    "++Punc", the root becomes "+" with a single "Punc" inflectional
    group. Otherwise, when the first piece contains "+", everything
    before the first "+" becomes the root and the rest becomes the first
    InflectionalGroup; with no "+" the whole piece is the root. Every
    remaining piece becomes a further InflectionalGroup.

    For a list, the first element is treated like the first string piece
    above and the remaining elements become InflectionalGroups.

    PARAMETERS
    ----------
    parse : str or list
        String or list input.
    """
    if parse is not None:
        if isinstance(parse, str):
            iGs = []
            st = parse
            while "^DB+" in st:
                iGs.append(st[:st.index("^DB+")])
                st = st[st.index("^DB+") + 4:]
            iGs.append(st)
            self.inflectionalGroups = []
            if iGs[0] == "++Punc":
                self.root = Word("+")
                self.inflectionalGroups.append(InflectionalGroup("Punc"))
            else:
                # BUG FIX: str.index raises ValueError when "+" is absent
                # (it never returns -1 like Java's indexOf), so the bare-root
                # else branch was unreachable and tagless roots crashed.
                if "+" in iGs[0]:
                    plusPos = iGs[0].index("+")
                    self.root = Word(iGs[0][:plusPos])
                    self.inflectionalGroups.append(InflectionalGroup(iGs[0][plusPos + 1:]))
                else:
                    self.root = Word(iGs[0])
                for i in range(1, len(iGs)):
                    self.inflectionalGroups.append(InflectionalGroup(iGs[i]))
        elif isinstance(parse, list):
            self.inflectionalGroups = []
            # Same indexOf-port fix as above; a tagless first element now
            # sets a bare root instead of raising ValueError.
            if "+" in parse[0]:
                plusPos = parse[0].index("+")
                self.root = Word(parse[0][:plusPos])
                self.inflectionalGroups.append(InflectionalGroup(parse[0][plusPos + 1:]))
            else:
                self.root = Word(parse[0])
            for i in range(1, len(parse)):
                self.inflectionalGroups.append(InflectionalGroup(parse[i]))
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Produces a spell-checked copy of the sentence: every word the
    morphological analyzer cannot parse is replaced by a randomly
    chosen candidate from candidateList (when one exists); all other
    words are copied unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    result = Sentence()
    for i in range(sentence.wordCount()):
        current = sentence.getWord(i)
        replacement = current
        if self.fsm.morphologicalAnalysis(current.getName()).size() == 0:
            candidates = self.candidateList(current)
            if candidates:
                replacement = Word(candidates[randrange(len(candidates))])
        result.addWord(replacement)
    return result
def candidateList(self, word: Word) -> list:
    """
    Builds the candidate list for a word. Each generated candidate is
    kept when the analyzer accepts it; otherwise the dictionary's
    correct form is tried as a replacement, and candidates with no
    analysable form at all are dropped.

    PARAMETERS
    ----------
    word : Word
        Word input.

    RETURNS
    -------
    list
        Candidates (or their corrected forms) that the analyzer accepts.
    """
    candidates = self.__generateCandidateList(word.getName())
    result = []
    for candidate in candidates:
        if self.fsm.morphologicalAnalysis(candidate).size() > 0:
            result.append(candidate)
        else:
            # Fall back to the dictionary's misspelling correction;
            # replaces the original in-place pop loop with its manual
            # index decrement.
            corrected = self.fsm.getDictionary().getCorrectForm(candidate)
            if corrected != "" and self.fsm.morphologicalAnalysis(corrected).size() > 0:
                result.append(corrected)
    return result
def deasciify(self, sentence: Sentence) -> Sentence:
    """
    Builds a deasciified copy of the sentence: each word with no
    morphological analysis is replaced by a randomly selected candidate
    from candidateList (when any exists); parsable words are copied
    unchanged.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        result Sentence.
    """
    result = Sentence()
    for i in range(sentence.wordCount()):
        current = sentence.getWord(i)
        replacement = current
        if self.fsm.morphologicalAnalysis(current.getName()).size() == 0:
            candidates = self.candidateList(current)
            if candidates:
                replacement = Word(candidates[randrange(len(candidates))])
        result.addWord(replacement)
    return result
def getBestRootWord(self, fsmParseList: FsmParseList) -> Word:
    """
    Scores every parse in the list by the product of its word-with-POS
    unigram probability and its transition-list unigram probability, and
    returns the word of the highest-scoring parse.

    PARAMETERS
    ----------
    fsmParseList : FsmParseList
        Source of the part-of-speech tags and transition lists.

    RETURNS
    -------
    Word
        The word with the highest probability (None for an empty list).
    """
    bestWord = None
    bestProbability = -1
    for j in range(fsmParseList.size()):
        fsmParse = fsmParseList.getFsmParse(j)
        candidateWord = fsmParse.getWordWithPos()
        transitionIg = Word(fsmParse.getTransitionList())
        probability = (self.wordUniGramModel.getProbability(candidateWord)
                       * self.igUniGramModel.getProbability(transitionIg))
        # Strict > keeps the first parse on probability ties.
        if probability > bestProbability:
            bestWord = candidateWord
            bestProbability = probability
    return bestWord
def getParseWithBestIgProbability(self, parseList: FsmParseList, correctFsmParses: list, index: int) -> FsmParse:
    """
    Scores every parse in the list by the probability of its transition
    list (wrapped in a Word) given the already-chosen parses, and
    returns the highest-scoring parse.

    PARAMETERS
    ----------
    parseList : FsmParseList
        FsmParseList is used to get the FsmParse.
    correctFsmParses : list
        Previously disambiguated parses used as context.
    index : int
        Index of the FsmParse whose transition list is scored.

    RETURNS
    -------
    FsmParse
        The parse with the highest probability (None for an empty list).
    """
    bestParse = None
    bestProbability = -1
    for j in range(parseList.size()):
        candidate = parseList.getFsmParse(j)
        transitionIg = Word(candidate.getTransitionList())
        candidateProbability = self.getIgProbability(transitionIg, correctFsmParses, index)
        # Strict > keeps the first parse on probability ties.
        if candidateProbability > bestProbability:
            bestParse = candidate
            bestProbability = candidateProbability
    return bestParse
def getMetaMorphemeTagForParse(self, parse: MorphologicalParse, tag: str) -> list:
    """
    Collects the morphotactic tags whose meta morpheme equals the given
    tag (after stripping a leading punctuation symbol) and which the
    given parse actually contains.

    PARAMETERS
    ----------
    parse : MorphologicalParse
        MorphologicalParse type input.
    tag : str
        String to get meta morphemes from.

    RETURNS
    -------
    list
        Matching morphotactic tags.
    """
    if Word.isPunctuationSymbol(tag[0]):
        tag = tag[1:]
    return [MetamorphicParse.morphotacticTags[j]
            for j in range(len(MetamorphicParse.metaMorphemes))
            if tag == self.metaMorphemes[j]
            and parse.containsTag(MetamorphicParse.morphotacticTags[j])]
def train(self, corpus: DisambiguationCorpus):
    """
    Trains word and inflectional-group (ig) n-gram models from a
    disambiguation corpus. For each consecutive word pair the
    words-with-POS are added to the word unigram and bigram models, and
    the current word's last inflectional group is paired with each
    inflectional group of the next word for the ig models. All four
    models are then smoothed with LaplaceSmoothing.

    PARAMETERS
    ----------
    corpus : DisambiguationCorpus
        DisambiguationCorpus to train.
    """
    # Fixed-size scratch buffers reused for every addNGram call.
    # NOTE(review): this assumes NGram.addNGram copies its argument
    # rather than keeping a reference — confirm against the NGram API.
    words1 = [None]
    igs1 = [None]
    words2 = [None, None]
    igs2 = [None, None]
    self.wordUniGramModel = NGram(1)
    self.igUniGramModel = NGram(1)
    self.wordBiGramModel = NGram(2)
    self.igBiGramModel = NGram(2)
    for sentence in corpus.sentences:
        # Iterates to wordCount()-1: the final word only enters the
        # models as the "next word" of its predecessor.
        for j in range(sentence.wordCount() - 1):
            word = sentence.getWord(j)
            nextWord = sentence.getWord(j + 1)
            words2[0] = word.getParse().getWordWithPos()
            words1[0] = words2[0]
            words2[1] = nextWord.getParse().getWordWithPos()
            self.wordUniGramModel.addNGram(words1)
            self.wordBiGramModel.addNGram(words2)
            # Pair the current word's LAST inflectional group with every
            # inflectional group of the next word.
            for k in range(nextWord.getParse().size()):
                igs2[0] = Word(word.getParse().getLastInflectionalGroup().__str__())
                igs2[1] = Word(nextWord.getParse().getInflectionalGroup(k).__str__())
                self.igBiGramModel.addNGram(igs2)
                igs1[0] = igs2[1]
                self.igUniGramModel.addNGram(igs1)
    # All four models use Laplace smoothing (not interpolated smoothing).
    self.wordUniGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.igUniGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.wordBiGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
    self.igBiGramModel.calculateNGramProbabilitiesSimple(LaplaceSmoothing())
def getMetaMorphemeTag(self, tag: str) -> list:
    """Returns the morphotactic tags whose meta morpheme equals the given
    tag, after stripping a leading punctuation symbol from the tag."""
    if Word.isPunctuationSymbol(tag[0]):
        tag = tag[1:]
    return [MetamorphicParse.morphotacticTags[j]
            for j in range(len(MetamorphicParse.metaMorphemes))
            if tag == self.metaMorphemes[j]]
def test_WordAsciify(self):
    """Check that Turkish-specific characters are mapped to their ASCII counterparts."""
    cases = [
        ("çöğüşıÇÖĞÜŞİ", "cogusiCOGUSI"),
        ("söğüş", "sogus"),
        ("üçkağıtçılık", "uckagitcilik"),
        ("akışkanlıştırıcılık", "akiskanlistiricilik"),
        ("çıtçıtçılık", "citcitcilik"),
        ("düşkırıklığı", "duskirikligi"),
        ("yüzgörümlüğü", "yuzgorumlugu"),
    ]
    for turkish, asciiForm in cases:
        self.assertEqual(asciiForm, self.simpleAsciifier.asciifyWord(Word(turkish)))
def spellCheck(self, sentence: Sentence) -> Sentence:
    """
    Spell-checks a sentence using root n-gram probabilities. For each
    word that has no morphological analysis, every candidate from
    candidateList is analyzed, its longest-root word is extracted, and
    the candidate whose root has the highest n-gram probability — a
    bigram with the previous word's root when one is known, otherwise a
    unigram — replaces the word. Parsable words are kept as-is and their
    longest root becomes the context for the next word.

    PARAMETERS
    ----------
    sentence : Sentence
        Sentence type input.

    RETURNS
    -------
    Sentence
        Sentence result.
    """
    # Root of the previously emitted word; bigram context for scoring.
    previousRoot = None
    result = Sentence()
    for i in range(sentence.wordCount()):
        word = sentence.getWord(i)
        fsmParses = self.fsm.morphologicalAnalysis(word.getName())
        if fsmParses.size() == 0:
            candidates = self.candidateList(word)
            # Defaults keep the misspelled word when no candidate scores
            # above 0.0.
            bestCandidate = word.getName()
            bestRoot = word
            bestProbability = 0.0
            for candidate in candidates:
                # NOTE: rebinds fsmParses; the outer (empty) parse list
                # is no longer needed at this point.
                fsmParses = self.fsm.morphologicalAnalysis(candidate)
                root = fsmParses.getParseWithLongestRootWord().getWord()
                if previousRoot is not None:
                    probability = self.__nGram.getProbability(
                        previousRoot.getName(), root.getName())
                else:
                    probability = self.__nGram.getProbability(
                        root.getName())
                if probability > bestProbability:
                    bestCandidate = candidate
                    bestRoot = root
                    bestProbability = probability
            # NOTE(review): when no candidate wins, bestRoot is the
            # unparsable word itself and still becomes the bigram
            # context — confirm this is intended.
            previousRoot = bestRoot
            result.addWord(Word(bestCandidate))
        else:
            result.addWord(word)
            previousRoot = fsmParses.getParseWithLongestRootWord().getWord()
    return result
def __init__(self, parse=None):
    """
    A constructor of MetamorphicParse class which splits the given
    analysis string on '+' into a root Word and a list of meta
    morphemes.

    PARAMETERS
    ----------
    parse : str
        String to parse; the single string "+" denotes a punctuation
        root with no meta morphemes.
    """
    if parse is not None:
        self.__metaMorphemeList = []
        if parse == "+":
            self.__root = Word("+")
        else:
            # BUG FIX: str.split("\\+") splits on the literal two-character
            # text "\+" (a leftover Java regex), so the parse string was
            # never split and no meta morphemes were ever collected.
            words = parse.split("+")
            self.__root = Word(words[0])
            for i in range(1, len(words)):
                self.__metaMorphemeList.append(words[i])
def getWordWithPos(self) -> Word:
    """
    Combines the root with the first tag of the first inflectional
    group into a single "root+TAG" Word.

    RETURNS
    -------
    Word
        Root with the MorphologicalTag of the first inflectional group
        as a new word.
    """
    firstTag = self.firstInflectionalGroup().getTag(0)
    tagString = InflectionalGroup.getTagString(firstTag)
    return Word(self.root.getName() + "+" + tagString)
def __init__(self, surfaceForm: str):
    """
    Binary attribute for a given word: True when the surface form is an
    honorific such as "bay" or "bayan", False otherwise.

    PARAMETERS
    ----------
    surfaceForm : str
        Surface form of the word.
    """
    isHonorificWord = Word.isHonorific(surfaceForm)
    super().__init__(isHonorificWord)
def __init__(self, surfaceForm: str):
    """
    Binary attribute for a given word: True when the surface form is a
    currency word such as "dolar", "euro" or "sterlin", False otherwise.

    PARAMETERS
    ----------
    surfaceForm : str
        Surface form of the word.
    """
    isMoneyWord = Word.isMoney(surfaceForm)
    super().__init__(isMoneyWord)
def __init__(self, surfaceForm: str):
    """
    Binary attribute for a given word: True when the surface form
    represents a time expression, False otherwise.

    PARAMETERS
    ----------
    surfaceForm : str
        Surface form of the word.
    """
    isTimeWord = Word.isTime(surfaceForm)
    super().__init__(isTimeWord)
def __init__(self, surfaceForm: str):
    """
    Binary attribute for a given word: True when the surface form is an
    organization marker such as "corp.", "inc." or "co.", False
    otherwise.

    PARAMETERS
    ----------
    surfaceForm : str
        Surface form of the word.
    """
    isOrganizationWord = Word.isOrganization(surfaceForm)
    super().__init__(isOrganizationWord)
def getMetaMorphemeTagForParse(self, parse: MorphologicalParse, tag: str) -> list:
    """
    Collects the morphotactic tags whose meta morpheme equals the given
    tag (after stripping a leading punctuation character) and which the
    given parse actually contains.

    NOTE(review): this variant strips the first character via
    Word.isPunctuation, while the documented sibling implementation uses
    Word.isPunctuationSymbol — confirm which predicate is intended.

    PARAMETERS
    ----------
    parse : MorphologicalParse
        MorphologicalParse type input.
    tag : str
        String to get meta morphemes from.

    RETURNS
    -------
    list
        Matching morphotactic tags.
    """
    if Word.isPunctuation(tag[0]):
        tag = tag[1:]
    return [MetamorphicParse.morphotacticTags[j]
            for j in range(len(MetamorphicParse.metaMorphemes))
            if tag == self.metaMorphemes[j]
            and parse.containsTag(MetamorphicParse.morphotacticTags[j])]
def test_GetCount(self):
    """Check word frequency counts in both the full and the simple corpus."""
    for corpus, token, expectedCount in [
            (self.corpus, "mustafa", 309),
            (self.corpus, "kemal", 109),
            (self.corpus, "atatürk", 122),
            (self.simpleCorpus, "ali", 4),
            (self.simpleCorpus, "gitti", 3),
            (self.simpleCorpus, "at", 4)]:
        self.assertEqual(expectedCount, corpus.getCount(Word(token)))
def isChunkLabel(self) -> bool:
    """
    Checks if this symbol can be a chunk label or not: either a
    punctuation symbol, or a name whose dash suffix is stripped and
    which then appears among the sentence or phrase labels.

    RETURNS
    -------
    bool
        True if this symbol can be a chunk label, False otherwise.
    """
    import re
    # BUG FIX: str.replace("-.*", "") removed nothing because it matches
    # the literal text "-.*" (a leftover Java replaceAll regex), so
    # dashed labels like "NP-SBJ" never matched the label sets. Use a
    # real regex to strip the "-..." suffix before the lookup.
    strippedName = re.sub("-.*", "", self.name)
    return (Word.isPunctuationSymbol(self.name)
            or strippedName in self.sentenceLabels
            or strippedName in self.phraseLabels)
def setUp(self) -> None:
    """Builds the fixture sentence "ali topu at mehmet ayşeyle gitti"."""
    self.sentence = Sentence()
    for token in ["ali", "topu", "at", "mehmet", "ayşeyle", "gitti"]:
        self.sentence.addWord(Word(token))
def contains(self, word: str) -> bool:
    """
    Checks whether the given surface form is present in the wordList
    CounterHashMap.

    PARAMETERS
    ----------
    word : str
        String input to check.

    RETURNS
    -------
    bool
        True if wordList has the given word, False otherwise.
    """
    candidate = Word(word)
    return candidate in self.wordList
def wordCount(self, excludeStopWords: bool) -> int:
    """
    Recursively counts the words in the subtree rooted at this node.

    PARAMETERS
    ----------
    excludeStopWords : bool
        If True, punctuation symbols, starred trace nodes and common
        English stop words are not counted.

    RETURNS
    -------
    int
        Number of words in the subtree rooted at this node.
    """
    # Same word set as the original ~50-term chained-or comparison
    # (duplicates "at"/"will"/"would" removed); one set-membership test
    # and a single lower() call replace the repeated lookups.
    stopWords = {"at", "the", "to", "a", "an", "not", "is", "was", "were",
                 "have", "had", "has", "by", "'re", "on", "off", "'s",
                 "n't", "can", "could", "may", "might", "will", "would",
                 "as", "with", "for", "in", "than", "$", "and", "or",
                 "of", "are", "be", "been", "do", "few", "there", "up",
                 "down"}
    if len(self.children) == 0:
        if not excludeStopWords:
            total = 1
        else:
            loweredName = self.data.getName().lower()
            if Word.isPunctuationSymbol(self.data.getName()) \
                    or "*" in loweredName or loweredName in stopWords:
                total = 0
            else:
                total = 1
    else:
        total = 0
    for child in self.children:
        total += child.wordCount(excludeStopWords)
    return total