def autoLabelSingleSemantics(self, sentence: AnnotatedSentence) -> bool:
    """
    Assigns a randomly chosen candidate sense to every word that has at
    least one candidate synset. The generator is seeded with a fixed value
    so repeated runs produce the same labeling.

    PARAMETERS
    ----------
    sentence : AnnotatedSentence
        The sentence whose words will be sense-labeled.

    RETURNS
    -------
    bool
        Always True.
    """
    random.seed(1)
    for position in range(sentence.wordCount()):
        candidates = self.getCandidateSynSets(self.__turkishWordNet, self.__fsm, sentence, position)
        if candidates:
            # Pick one candidate uniformly at random and record its id.
            chosen = candidates[random.randrange(len(candidates))]
            sentence.getWord(position).setSemantic(chosen.getId())
    return True
    def autoNER(self, sentence: AnnotatedSentence):
        """
        Automatically detects named entities in a sentence by running the
        specialized detectors in a fixed priority order — PERSON, LOCATION,
        ORGANIZATION, MONEY, TIME — and finally tagging every word still
        without a named-entity type as "NONE".

        PARAMETERS
        ----------
        sentence : AnnotatedSentence
            The sentence for which named entities are checked.
        """
        # Detector order matters: earlier detectors get first claim on words.
        for detector in (self.autoDetectPerson,
                         self.autoDetectLocation,
                         self.autoDetectOrganization,
                         self.autoDetectMoney,
                         self.autoDetectTime):
            detector(sentence)
        # Every word left untagged receives the explicit "NONE" label.
        for position in range(sentence.wordCount()):
            token = sentence.getWord(position)
            if isinstance(token, AnnotatedWord) and token.getNamedEntityType() is None:
                token.setNamedEntityType("NONE")
 def autoLabelSingleSemantics(self, sentence: AnnotatedSentence) -> bool:
     """
     Labels each word of the sentence with the most frequent sense among its
     candidate synsets, when such a sense can be determined.

     PARAMETERS
     ----------
     sentence : AnnotatedSentence
         The sentence whose words will be sense-labeled.

     RETURNS
     -------
     bool
         Always True.
     """
     for position in range(sentence.wordCount()):
         candidates = self.getCandidateSynSets(self.__turkishWordNet, self.__fsm, sentence, position)
         if not candidates:
             continue
         # Rank candidates by frequency relative to the word's root form.
         root = sentence.getWord(position).getParse().getWord().getName()
         chosen = self.mostFrequent(candidates, root)
         if chosen is not None:
             sentence.getWord(position).setSemantic(chosen.getId())
     return True
    def autoArgument(self, sentence: AnnotatedSentence) -> bool:
        """
        Given the sentence for which the predicate(s) were determined before, this method automatically assigns
        semantic role labels to some/all words in the sentence. The method first finds the first predicate, then
        assuming that the shallow parse tags were preassigned, assigns ÖZNE tagged words ARG0; NESNE tagged words ARG1.
        If the verb is in passive form, ÖZNE tagged words are assigned as ARG1.

        PARAMETERS
        ----------
        sentence : AnnotatedSentence
            The sentence for which semantic roles will be determined automatically.

        RETURNS
        -------
        bool
            If the method assigned at least one word a semantic role label, the method returns true; false otherwise.
        """
        # Locate the first word already tagged as the PREDICATE.
        predicateId = None
        for position in range(sentence.wordCount()):
            candidate = sentence.getWord(position)
            if isinstance(candidate, AnnotatedWord) \
                    and candidate.getArgument() is not None \
                    and candidate.getArgument().getArgumentType() == "PREDICATE":
                predicateId = candidate.getArgument().getId()
                break
        if predicateId is None:
            # No predicate found: nothing can be labeled.
            return False
        modified = False
        for position in range(sentence.wordCount()):
            token = sentence.getWord(position)
            # Only untagged AnnotatedWords are candidates for role labels.
            if not isinstance(token, AnnotatedWord) or token.getArgument() is not None:
                continue
            shallow = token.getShallowParse()
            if shallow == "ÖZNE":
                parse = token.getParse()
                # A passive verb demotes the subject to ARG1.
                if parse is not None and parse.containsTag(MorphologicalTag.PASSIVE):
                    token.setArgument("ARG1$" + predicateId)
                else:
                    token.setArgument("ARG0$" + predicateId)
                modified = True
            elif shallow == "NESNE":
                token.setArgument("ARG1$" + predicateId)
                modified = True
        return modified
 def getCandidateSynSets(self, wordNet: WordNet,
                         fsm: FsmMorphologicalAnalyzer,
                         sentence: AnnotatedSentence, index: int) -> list:
     """
     Collects candidate synsets for the word at the given position: the
     single-word synsets of the word itself, plus idiom (multiword) synsets
     built from every three-word and two-word window that contains the word.

     PARAMETERS
     ----------
     wordNet : WordNet
         Wordnet used to construct the synsets.
     fsm : FsmMorphologicalAnalyzer
         Morphological analyzer passed through to the wordnet lookups.
     sentence : AnnotatedSentence
         Sentence containing the word.
     index : int
         Position of the word in the sentence.

     RETURNS
     -------
     list
         All candidate synsets for the word at the given position.
     """
     wordCount = sentence.wordCount()
     current = sentence.getWord(index)
     prev2 = sentence.getWord(index - 2) if index > 1 else None
     prev1 = sentence.getWord(index - 1) if index > 0 else None
     next1 = sentence.getWord(index + 1) if index != wordCount - 1 else None
     next2 = sentence.getWord(index + 2) if index < wordCount - 2 else None
     # Single-word senses for the current word.
     candidates = wordNet.constructSynSets(
         current.getParse().getWord().getName(), current.getParse(),
         current.getMetamorphicParse(), fsm)
     # Three-word idiom windows: (prev2, prev1, cur), (prev1, cur, next1), (cur, next1, next2).
     if prev2 is not None and prev2.getParse() is not None and prev1.getParse() is not None:
         candidates.extend(wordNet.constructIdiomSynSets(
             fsm, prev2.getParse(), prev2.getMetamorphicParse(),
             prev1.getParse(), prev1.getMetamorphicParse(),
             current.getParse(), current.getMetamorphicParse()))
     if prev1 is not None and prev1.getParse() is not None \
             and next1 is not None and next1.getParse() is not None:
         candidates.extend(wordNet.constructIdiomSynSets(
             fsm, prev1.getParse(), prev1.getMetamorphicParse(),
             current.getParse(), current.getMetamorphicParse(),
             next1.getParse(), next1.getMetamorphicParse()))
     if next1 is not None and next1.getParse() is not None \
             and next2 is not None and next2.getParse() is not None:
         candidates.extend(wordNet.constructIdiomSynSets(
             fsm, current.getParse(), current.getMetamorphicParse(),
             next1.getParse(), next1.getMetamorphicParse(),
             next2.getParse(), next2.getMetamorphicParse()))
     # Two-word idiom windows: (prev1, cur) and (cur, next1).
     if prev1 is not None and prev1.getParse() is not None:
         candidates.extend(wordNet.constructIdiomSynSets(
             fsm, prev1.getParse(), prev1.getMetamorphicParse(),
             current.getParse(), current.getMetamorphicParse()))
     if next1 is not None and next1.getParse() is not None:
         candidates.extend(wordNet.constructIdiomSynSets(
             fsm, current.getParse(), current.getMetamorphicParse(),
             next1.getParse(), next1.getMetamorphicParse()))
     return candidates
    def generate(self) -> DisambiguationCorpus:
        """
        Creates a morphological disambiguation corpus from the annotated corpus:
        every AnnotatedWord becomes a DisambiguatedWord carrying its name and
        morphological parse.

        RETURNS
        -------
        DisambiguationCorpus
            Created disambiguation corpus.
        """
        corpus = DisambiguationCorpus()
        for sentenceIndex in range(self.__annotatedCorpus.sentenceCount()):
            source = self.__annotatedCorpus.getSentence(sentenceIndex)
            target = AnnotatedSentence()
            for wordIndex in range(source.wordCount()):
                token = source.getWord(wordIndex)
                # Non-AnnotatedWord entries are silently skipped.
                if isinstance(token, AnnotatedWord):
                    target.addWord(DisambiguatedWord(token.getName(), token.getParse()))
            corpus.addSentence(target)
        return corpus
 def autoLabelSingleSemantics(self, sentence: AnnotatedSentence) -> bool:
     """
     Lesk-style disambiguation: for each word, keeps the candidate synsets
     whose definition/example overlaps the sentence the most and picks one of
     them at random (seeded so results are reproducible).

     PARAMETERS
     ----------
     sentence : AnnotatedSentence
         The sentence for which word senses will be determined automatically.

     RETURNS
     -------
     bool
         True if at least one word was assigned a sense, False otherwise.
     """
     random.seed(1)
     done = False
     for i in range(sentence.wordCount()):
         synSets = self.getCandidateSynSets(self.__turkishWordNet,
                                            self.__fsm, sentence, i)
         # Single pass over the candidates: the original computed
         # self.intersection twice per synset (one pass to find the max, a
         # second to collect the ties). Track the running maximum and the
         # tied synsets together instead.
         maxIntersection = -1
         maxSynSets = []
         for synSet in synSets:
             intersectionCount = self.intersection(synSet, sentence)
             if intersectionCount > maxIntersection:
                 maxIntersection = intersectionCount
                 maxSynSets = [synSet]
             elif intersectionCount == maxIntersection:
                 maxSynSets.append(synSet)
         if len(maxSynSets) > 0:
             done = True
             # Break ties randomly among the best-overlapping synsets.
             sentence.getWord(i).setSemantic(
                 maxSynSets[randrange(len(maxSynSets))].getId())
     return done
 def intersection(self, synSet: SynSet, sentence: AnnotatedSentence) -> int:
     """
     Counts the case-insensitive word overlap between the synset's gloss
     (long definition plus example, when present) and the sentence: every
     (gloss word, sentence word) pair whose lower-cased forms match
     contributes one to the count.

     PARAMETERS
     ----------
     synSet : SynSet
         Synset whose definition/example supplies the first bag of words.
     sentence : AnnotatedSentence
         Sentence supplying the second bag of words.

     RETURNS
     -------
     int
         Number of matching (gloss word, sentence word) pairs.
     """
     from collections import Counter
     gloss = synSet.getLongDefinition()
     if synSet.getExample() is not None:
         gloss = gloss + " " + synSet.getExample()
     # Count pair matches in O(n + m) instead of the original O(n * m)
     # nested scan: for each shared lower-cased token the pair count is
     # (gloss multiplicity) * (sentence multiplicity), which the sum below
     # accumulates one gloss token at a time.
     sentenceCounts = Counter(token.lower() for token in sentence.toString().split(" "))
     return sum(sentenceCounts[token.lower()] for token in gloss.split(" "))
    def generate(self) -> DisambiguationCorpus:
        """
        Creates a morphological disambiguation corpus from the treeBank. Calls generateAnnotatedSentence for each parse
        tree in the treebank whose INFLECTIONAL_GROUP layer is fully annotated.

        RETURNS
        -------
        DisambiguationCorpus
            Created disambiguation corpus.
        """
        corpus = DisambiguationCorpus()
        for treeIndex in range(self.__treeBank.size()):
            tree = self.__treeBank.get(treeIndex)
            # Skip trees whose inflectional-group layer is not complete.
            if not tree.layerAll(ViewLayerType.INFLECTIONAL_GROUP):
                continue
            source = tree.generateAnnotatedSentence()
            target = AnnotatedSentence()
            for wordIndex in range(source.wordCount()):
                token = source.getWord(wordIndex)
                if isinstance(token, AnnotatedWord):
                    target.addWord(DisambiguatedWord(token.getName(), token.getParse()))
            corpus.addSentence(target)
        return corpus
# --- Example #10 (score: 0) — scraped example-listing marker ---
    def autoPredicate(self, sentence: AnnotatedSentence) -> bool:
        """
        The method uses predicateFrameCandidates method to predict possible predicates. For each candidate, it sets for that
        word PREDICATE tag.

        PARAMETERS
        ----------
        sentence : AnnotatedSentence
            The sentence for which predicates will be determined automatically.

        RETURNS
        -------
        bool
            If at least one word has been tagged, true; false otherwise.
        """
        candidates = sentence.predicateFrameCandidates(self.__frameNet)
        for candidate in candidates:
            if isinstance(candidate, AnnotatedWord):
                # Frame element format: PREDICATE$<frame>$<sense id>.
                candidate.setFrameElement("PREDICATE$NONE$" + candidate.getSemantic())
        return len(candidates) > 0
    def __init__(self, folder: str, pattern: str = None):
        """
        A constructor of AnnotatedCorpus class which reads all AnnotatedSentence files with the file
        name satisfying the given pattern inside the given folder. For each file inside that folder, the constructor
        creates an AnnotatedSentence and appends it to the sentences list.

        PARAMETERS
        ----------
        folder : str
            Folder where all sentences reside.
        pattern : str
            File pattern such as "." ".train" ".test".
        """
        self.sentences = []
        for root, dirs, files in os.walk(folder):
            for file in files:
                fileName = os.path.join(root, file)
                # Only files whose names start like "1234.", optionally
                # filtered by the substring pattern.
                if (pattern is None or pattern in fileName) and re.match(r"\d+\.", file):
                    # Close the handle once the sentence is built — the
                    # original leaked one open file per sentence. Assumes the
                    # AnnotatedSentence constructor consumes the file; the
                    # original never touched the handle afterwards either.
                    with open(fileName, "r", encoding='utf8') as sentenceFile:
                        self.sentences.append(AnnotatedSentence(sentenceFile, fileName))
# --- Example #12 (score: 0) — scraped example-listing marker ---
 def generateAnnotatedSentence(self,
                               language: str = None) -> AnnotatedSentence:
     """
     Converts this parse tree into an AnnotatedSentence. With no language,
     the Turkish leaves are exported word by word from their layer info;
     with a language, each English leaf becomes a single word carrying the
     leaf's data and its parent's POS tag.

     PARAMETERS
     ----------
     language : str
         When given, leaves are exported in "{<language>=...}{posTag=...}" form.

     RETURNS
     -------
     AnnotatedSentence
         Sentence built from the leaves of this tree.
     """
     sentence = AnnotatedSentence()
     if language is None:
         collector = NodeDrawableCollector(self.root, IsTurkishLeafNode())
         for leaf in collector.collect():
             if isinstance(leaf, ParseNodeDrawable):
                 info = leaf.getLayerInfo()
                 # A single leaf may hold several words in its layer info.
                 for wordIndex in range(info.getNumberOfWords()):
                     sentence.addWord(info.toAnnotatedWord(wordIndex))
     else:
         collector = NodeDrawableCollector(self.root, IsEnglishLeafNode())
         for leaf in collector.collect():
             if isinstance(leaf, ParseNodeDrawable):
                 sentence.addWord(AnnotatedWord(
                     "{" + language + "=" + leaf.getData().getName() +
                     "}{posTag=" + leaf.getParent().getData().getName() + "}"))
     return sentence
# --- Example #13 (score: 0) — scraped example-listing marker ---
    def autoLabelSingleSemantics(self, sentence: AnnotatedSentence):
        """
        The method checks
        1. the previous two words and the current word; the previous, current and next word; current and the next
        two words for a three word multiword expression that occurs in the Turkish wordnet.
        2. the previous word and current word; current word and the next word for a two word multiword expression that
        occurs in the Turkish wordnet.
        3. the current word
        if it has only one sense. If there is only one sense for that multiword expression or word, it sets that sense.

        PARAMETERS
        ----------
        sentence : AnnotatedSentence
            The sentence for which word sense disambiguation will be determined automatically.
        """
        for i in range(sentence.wordCount()):
            # BUG FIX: reset the neighbor slots on every iteration. The
            # original never initialized them, so the first iteration raised
            # UnboundLocalError (e.g. 'previous' read at i == 0) and later
            # iterations saw stale neighbors from earlier positions (e.g.
            # 'next' kept its old value on the last word).
            twoPrevious = None
            previous = None
            next = None
            twoNext = None
            current = sentence.getWord(i)
            if i > 1:
                twoPrevious = sentence.getWord(i - 2)
            if i > 0:
                previous = sentence.getWord(i - 1)
            if i != sentence.wordCount() - 1:
                next = sentence.getWord(i + 1)
            if i < sentence.wordCount() - 2:
                twoNext = sentence.getWord(i + 2)
            if isinstance(current, AnnotatedWord) and current.getSemantic() is None \
                    and current.getParse() is not None:
                # 1. Three-word idiom windows around the current word.
                if twoPrevious is not None and isinstance(twoPrevious, AnnotatedWord) \
                        and twoPrevious.getParse() is not None and isinstance(previous, AnnotatedWord) \
                        and previous.getParse() is not None:
                    idioms = self.__turkishWordNet.constructIdiomSynSets(
                        self.__fsm, twoPrevious.getParse(),
                        twoPrevious.getMetamorphicParse(), previous.getParse(),
                        previous.getMetamorphicParse(), current.getParse(),
                        current.getMetamorphicParse())
                    if len(idioms) == 1:
                        current.setSemantic(idioms[0].getId())
                        continue
                if previous is not None and isinstance(previous, AnnotatedWord) \
                        and previous.getParse() is not None and next is not None \
                        and isinstance(next, AnnotatedWord) and next.getParse() is not None:
                    idioms = self.__turkishWordNet.constructIdiomSynSets(
                        self.__fsm, previous.getParse(),
                        previous.getMetamorphicParse(), current.getParse(),
                        current.getMetamorphicParse(), next.getParse(),
                        next.getMetamorphicParse())
                    if len(idioms) == 1:
                        current.setSemantic(idioms[0].getId())
                        continue
                if next is not None and isinstance(next, AnnotatedWord) \
                        and next.getParse() is not None and twoNext is not None \
                        and isinstance(twoNext, AnnotatedWord) and twoNext.getParse() is not None:
                    idioms = self.__turkishWordNet.constructIdiomSynSets(
                        self.__fsm, current.getParse(),
                        current.getMetamorphicParse(), next.getParse(),
                        next.getMetamorphicParse(), twoNext.getParse(),
                        twoNext.getMetamorphicParse())
                    if len(idioms) == 1:
                        current.setSemantic(idioms[0].getId())
                        continue
                # 2. Two-word idiom windows.
                if previous is not None and isinstance(previous, AnnotatedWord) \
                        and previous.getParse() is not None:
                    idioms = self.__turkishWordNet.constructIdiomSynSets(
                        self.__fsm, previous.getParse(),
                        previous.getMetamorphicParse(), current.getParse(),
                        current.getMetamorphicParse())
                    if len(idioms) == 1:
                        current.setSemantic(idioms[0].getId())
                        continue
                if next is not None and isinstance(next, AnnotatedWord) \
                        and next.getParse() is not None:
                    idioms = self.__turkishWordNet.constructIdiomSynSets(
                        self.__fsm, current.getParse(),
                        current.getMetamorphicParse(), next.getParse(),
                        next.getMetamorphicParse())
                    if len(idioms) == 1:
                        current.setSemantic(idioms[0].getId())
                        continue
                # 3. Single word with exactly one sense.
                meanings = self.__turkishWordNet.constructSynSets(
                    current.getParse().getWord().getName(), current.getParse(),
                    current.getMetamorphicParse(), self.__fsm)
                if current.getSemantic() is None and len(meanings) == 1:
                    current.setSemantic(meanings[0].getId())
class AnnotatedSentenceTest(unittest.TestCase):
    """Regression tests for AnnotatedSentence on the ../sentences/000N.dev fixtures."""

    sentence0: AnnotatedSentence
    sentence1: AnnotatedSentence
    sentence2: AnnotatedSentence
    sentence3: AnnotatedSentence
    sentence4: AnnotatedSentence
    sentence5: AnnotatedSentence
    sentence6: AnnotatedSentence
    sentence7: AnnotatedSentence
    sentence8: AnnotatedSentence
    sentence9: AnnotatedSentence

    def _load(self, index: int) -> AnnotatedSentence:
        # Open the fixture, build the sentence, and close the handle — the
        # original kept ten files open per test (ResourceWarning). Assumes
        # the AnnotatedSentence constructor consumes the file; the original
        # never touched the handle afterwards either.
        fileName = "../sentences/%04d.dev" % index
        with open(fileName, "r", encoding='utf8') as sentenceFile:
            return AnnotatedSentence(sentenceFile, fileName)

    def setUp(self) -> None:
        # Loads ../sentences/0000.dev .. 0009.dev into sentence0 .. sentence9.
        for index in range(10):
            setattr(self, "sentence" + str(index), self._load(index))

    # NOTE: assertEquals is a deprecated alias removed in Python 3.12;
    # all assertions below use assertEqual.
    def test_GetShallowParseGroups(self):
        self.assertEqual(4, len(self.sentence0.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence1.getShallowParseGroups()))
        self.assertEqual(3, len(self.sentence2.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence3.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence4.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence5.getShallowParseGroups()))
        self.assertEqual(6, len(self.sentence6.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence7.getShallowParseGroups()))
        self.assertEqual(5, len(self.sentence8.getShallowParseGroups()))
        self.assertEqual(3, len(self.sentence9.getShallowParseGroups()))

    def test_ContainsPredicate(self):
        self.assertTrue(self.sentence0.containsPredicate())
        self.assertTrue(self.sentence1.containsPredicate())
        self.assertFalse(self.sentence2.containsPredicate())
        self.assertTrue(self.sentence3.containsPredicate())
        self.assertTrue(self.sentence4.containsPredicate())
        self.assertFalse(self.sentence5.containsPredicate())
        self.assertFalse(self.sentence6.containsPredicate())
        self.assertTrue(self.sentence7.containsPredicate())
        self.assertTrue(self.sentence8.containsPredicate())
        self.assertTrue(self.sentence9.containsPredicate())

    def test_GetPredicate(self):
        self.assertEqual("bulandırdı", self.sentence0.getPredicate(0))
        self.assertEqual("yapacak", self.sentence1.getPredicate(0))
        self.assertEqual("ediyorlar", self.sentence3.getPredicate(0))
        self.assertEqual("yazmıştı", self.sentence4.getPredicate(0))
        self.assertEqual("olunacaktı", self.sentence7.getPredicate(0))
        self.assertEqual("gerekiyordu", self.sentence8.getPredicate(0))
        self.assertEqual("ediyor", self.sentence9.getPredicate(0))

    def test_ToStems(self):
        self.assertEqual("devasa ölçek yeni kanun kullan karmaşık ve çetrefil dil kavga bulan .", self.sentence0.toStems())
        self.assertEqual("gelir art usul komite gel salı gün kanun tasarı hakkında bir duruşma yap .", self.sentence1.toStems())
        self.assertEqual("reklam ve tanıtım iş yara yara gör üzere .", self.sentence2.toStems())
        self.assertEqual("bu defa , daha da hız hareket et .", self.sentence3.toStems())
        self.assertEqual("shearson lehman hutton ınc. dün öğle sonra kadar yeni tv reklam yaz .", self.sentence4.toStems())
        self.assertEqual("bu kez , firma hazır .", self.sentence5.toStems())
        self.assertEqual("`` diyalog sür kesinlikle temel önem haiz .", self.sentence6.toStems())
        self.assertEqual("cuma gün bu üzerine düşün çok geç kal ol .", self.sentence7.toStems())
        self.assertEqual("bu hakkında önceden düşün gerek . ''", self.sentence8.toStems())
        self.assertEqual("isim göre çeşit göster birkaç kefaret fon reklam yap için devam et .", self.sentence9.toStems())
 def setUp(self) -> None:
     """Load fixtures ../sentences/0000.dev through 0009.dev into sentence0..sentence9."""
     for index in range(10):
         fileName = "../sentences/%04d.dev" % index
         setattr(self, "sentence" + str(index),
                 AnnotatedSentence(open(fileName, "r", encoding='utf8'), fileName))