예제 #1
0
 def __init__(self):
     """Initialize empty containers for parsed nodes and sentence-dependency word nodes."""
     # Both collections start empty and are populated later during parsing.
     self._sent_dep_wNodes = slist()
     self._aNodes = slist()
예제 #2
0
    def test_StreamFileReading(self):
        """Round-trip a pickled slist-of-slists through an in-memory byte buffer."""
        buf = BytesIO()
        expected = slist(slist((i,)) for i in xrange(10))
        expected.dumpToPickle(buf)
        # Re-wrap the raw bytes so reading starts at offset zero.
        buf = BytesIO(buf.getvalue())
        loaded = stream.loadFromPickled(buf)
        self.assertEquals(list(expected), list(loaded))
예제 #3
0
 def __init__(self, syntWord, aDependencies):
     """
     Split *aDependencies* by whether this word is governor or slave,
     and index them by dependency name.
     :type syntWord: nlplib.spinlib.SyntWordNode
     :type aDependencies: list[GrammDep]
     """
     assert isinstance(syntWord, SyntWordNode)
     assert isinstance(aDependencies, list)
     pos = syntWord.position
     self._aGovDeps = slist(dp for dp in aDependencies if dp.gov.position == pos)
     self._aSlaveDeps = slist(dp for dp in aDependencies if dp.slave.position == pos)
     # Index by dependency name: governor side maps name -> slaves,
     # slave side maps name -> governors.
     self._hDeps = {'gov': defaultdict(list), 'slave': defaultdict(list)}
     for depName, _gov, slaveNode in self._aGovDeps:
         self._hDeps['gov'][depName].append(slaveNode)
     for depName, govNode, _slave in self._aSlaveDeps:
         self._hDeps['slave'][depName].append(govNode)
예제 #4
0
파일: parser.py 프로젝트: asuiu/pyNLP
    def segment_text(text):
        """
        Segment raw text into sentences using Stanford DocumentProcessor.
        :type text: str|unicode
        :rtype: slist[basestring]
        """
        text = text.strip()
        # Wrap the Python string in Java Reader/String objects for the JVM bridge.
        reader = _JClasses.JStringReader(_JClasses.JString(text))
        dp = _JClasses.JDocumentProcessor(reader)
        iterator = dp.iterator()
        sentences = slist()
        while iterator.hasNext():
            sentence_array = next(iterator)
            tokens = []
            for idx in range(sentence_array.size()):
                token = sentence_array[idx].toString()
                # Strip the escaping backslashes Stanford inserts (e.g. "\/").
                # One replace() pass removes every backslash, which also turns
                # "\/" into "/" -- so the old fixed-point loop and the separate
                # "\\/" -> "/" pass (dead code: it ran only after all
                # backslashes were already gone) are unnecessary.
                token = token.replace("\\", "")
                tokens.append(token)
            sentences.append(' '.join(tokens))
        return sentences
예제 #5
0
파일: parser.py 프로젝트: asuiu/pyNLP
    def segment_text(text):
        """
        Segment raw text into sentences using Stanford DocumentProcessor
        :type text: str|unicode
        :rtype: slist[basestring]
        """
        text = text.strip()
        # Wrap the Python string in Java Reader/String objects for the JVM bridge.
        reader = _JClasses.JStringReader(_JClasses.JString(text))
        dp = _JClasses.JDocumentProcessor(reader)
        iterator = dp.iterator()
        sentences = slist()
        while iterator.hasNext():
            # Java-style iterator call through the jpype proxy (not Python's next()).
            sentence_array = iterator.next()
            tokens = []
            for idx in range(sentence_array.size()):
                token = sentence_array[idx].toString()

                # Strip escaping backslashes until the token stops changing.
                # NOTE(review): the "\\/" -> "/" pass is dead code -- the
                # preceding replace("\\", "") has already removed every
                # backslash by the time it runs.
                old_token = ""
                while old_token != token:
                    old_token = token
                    token = token.replace("\\", "")
                    token = token.replace("\\/", "/")
                tokens.append(token)
            sentences.append(' '.join(tokens))
        return sentences
예제 #6
0
 def tokenizeText(self, text: str) -> slist:
     """
     Split *text* into sentences, then tokenize each sentence with the
     TreeBank tokenizer configured in __init__. Newlines inside a
     sentence are collapsed to single spaces first.
     """
     all_tokens = slist()
     for sentence in self.__tokenizeToSentences(text):
         flattened = creReplaceNLs.sub(r' ', sentence)
         all_tokens.extend(self._tb_tokenizer(flattened))
     return all_tokens
예제 #7
0
 def getEqDeps(self, depType, bGov=True):
     """
     Collect all nodes linked by dependencies equivalent to *depType*,
     from the governor side when *bGov* is true, otherwise the slave side.
     :type depType: DepType
     :type bGov: bool
     :rtype : slist[  SyntWordNode ] | None
     """
     side = 'gov' if bGov else 'slave'
     bucket = self._hDeps[side]
     matches = slist()
     for name in depType.equivalent_names:
         # .get avoids inserting new keys into the defaultdict.
         matches.extend(bucket.get(name, ()))
     return matches if matches else None
예제 #8
0
파일: tokenizers.py 프로젝트: asuiu/pyNLP
 def tokenizeText(self, text):
     """
     Split *text* into sentences, then run each through the TreeBank
     tokenizer set up in __init__, collapsing embedded newlines first.
     :param text:
     :type text: str | unicode
     :rtype: slist
     """
     result = slist()
     for sentence in self.__tokenizeToSentences(text):
         cleaned = creReplaceNLs.sub(r' ', sentence)
         result.extend(self._tb_tokenizer(cleaned))
     return result
예제 #9
0
 def getTaggedText(self) -> slist[Union[SyntToken, SyntWordToken]]:
     """Convert the stored tokens into SyntWordToken/SyntToken objects, preserving order."""
     tagged = slist()
     for idx, tok in enumerate(self._tokens):
         ctag = CTags.fromString(tok.tag_)
         if CTags.isWordType(ctag):
             tagged.append(SyntWordToken(text=tok.text, tag=ctag,
                                         stemmed=tok.lemma_, idx=idx))
             continue
         # Non-word tokens: punctuation/symbols collapse to SYM, a few
         # tags (CD, SYM, LS, UH) pass through, everything else becomes
         # the generic OTHER_TAG.
         if tok.is_punct or tok.pos_ in ("SYM", "PUNCT"):
             ctag = CTags.SYM
         elif ctag not in (CTags.CD, CTags.SYM, CTags.LS, CTags.UH):
             ctag = CTags.fromString("OTHER_TAG")
         tagged.append(SyntToken(text=tok.text, tag=ctag, idx=idx))
     return tagged
예제 #10
0
    def tokenizeText(self, text: str) -> List[SpacySentence]:
        """
        Normalize quotes and stray high bytes in *text*, run the spaCy
        model over it, and return one SpacySentence per detected sentence.
        """
        # Cleanup pipeline; substitutions are applied strictly in order.
        for pattern, repl in (
                (r'[`\x92\x91]', r"'"),
                (r'[\x93\x94\x95\x96\x85\xE9]', r'"'),
                (r'[\x80-\xFF]', r' '),
                (r"([:\s])\'(.+?)\'([\s\.])", r'\1"\2"\3'),
                (r"\s+", r' '),
        ):
            text = re.sub(pattern, repl, text)
        text = text.strip()

        doc: Doc = self._model(text)
        result = slist()
        span: Span
        for span in doc.sents:
            result.append(SpacySentence(list(span), span.text))
        return result
예제 #11
0
 def test_reduceUsesInitProperly(self):
     """reduce() must give the same result with and without an explicit initializer."""
     merge = lambda acc, nxt: acc.update(nxt)
     expected = set((1, 2, 3, 4))
     self.assertEquals(slist([sset((1, 2)), sset((3, 4))]).reduce(merge), expected)
     self.assertEquals(slist([sset((1, 2)), sset((3, 4))]).reduce(merge, sset()), expected)
예제 #12
0
파일: parser.py 프로젝트: asuiu/pyNLP
    def iterativeParse_jTree(self, jTree, sfp):
        """
        Parses the Tree object from JVM into Python VM
        Returns tuple (tree, dictionary of WordPositions)
        :type sfp: StanfordParser
        :type jTree: jpype._jclass.edu.stanford.nlp.trees.LabeledScoredTreeNode
        :rtype :  PhraseNode, slist[SyntWordNode]
        """
        # Iterative (explicit-stack) pre-order walk over the Java parse
        # tree; avoids Python recursion limits on deep trees.
        level = 0
        tree_index = 0
        ptree = self._constructPhraseNode(CTags.fromString(str(jTree.value())),
                                          level, jTree, self, tree_index)
        """:type : CPhraseNode"""
        level += 1  # only root is level 0
        tree_index += 1

        # Seed the work stack with the root's children; reversed so that
        # pop() from the end visits children left-to-right.
        q = []
        for c in jTree.children():
            q.append((c, ptree, level))
        q = list(reversed(q))

        # Maps character begin-position -> constructed word node.
        hWordsPositions = {}
        while len(q):
            # Each stack entry is (java node, parent python node, depth).
            node = q.pop()
            wh_add = node[1]
            level = node[2]
            node = node[0]
            if node.isLeaf():
                # A bare leaf must always be wrapped by a pre-terminal.
                raise ValueError("Malformed syntactic tree: unexpected leaf")
            elif node.isPreTerminal():
                # Pre-terminal = POS-tag node whose single child is the word.
                tag = str(node.value())
                w_label = node.children()[0].label()
                v = w_label.value()
                # Replace non-ASCII bytes before handing the word to the stemmer.
                word = re.sub(r'[\x80-\xFF]', r"_Unknown_char_", v)
                wtag = sfp.stem(word, tag)
                _wrd = wtag.word()
                if _wrd is None:
                    raise Exception(
                        "wtag returned by stemmer returns NULL word() for word %s and tag %s: %s %s"
                        % (word, tag, str(wtag), str(wtag.__class__)))

                # Manual stemming overrides take precedence over the stemmer.
                if wtag.word().lower() in STEM_EXCEPTIONS:
                    stemmed = STEM_EXCEPTIONS[wtag.word().lower()]
                else:
                    stemmed = wtag.word().lower()

                # Character offsets of the word in the original sentence.
                w_pos = (int(w_label.beginPosition()),
                         int(w_label.endPosition()))
                nt = self._constructWordNode(word, CTags.fromString(tag),
                                             stemmed, level, w_pos, node, self,
                                             tree_index, wh_add)
                tree_index += 1
                hWordsPositions[int(w_label.beginPosition())] = nt
                wh_add.addChild(nt)
            else:
                # Interior phrase node: construct it, then push its children
                # one level deeper (reversed to keep left-to-right order).
                nt = self._constructPhraseNode(
                    CTags.fromString(str(node.value())), level, node, self,
                    tree_index, wh_add)
                tree_index += 1
                wh_add.addChild(nt)
                level += 1
                chldren = []
                for c in node.children():
                    chldren.append((c, nt, level))
                q.extend(reversed(chldren))
        # Order word nodes by character begin-position, then assign each
        # its sequential position within the sentence.
        wNodes = slist(hWordsPositions[p]
                       for p in sorted(hWordsPositions.keys()))
        """:type : list[SyntWordNode]"""
        for i in range(len(wNodes)):
            wNodes[i].set_position(i)

        return ptree, wNodes
 def _getNextBuffer(self):
     """Decrement the internal counter and return a buffer of that many ints (empty once exhausted)."""
     self._counter -= 1
     if self._counter <= 0:
         return slist()
     return slist(range(self._counter))
예제 #14
0
 def testStreamList(self):
     """slist exposes toList() and supports negative indexing like a plain list."""
     make = lambda: slist((1, 2, 3))
     self.assertEqual([1, 2, 3], make().toList())
     self.assertEqual(3, make()[-1])
 def test_nominal(self):
     """Buffers of increasing ranges are flattened in submission order."""
     buffers = (slist(xrange(i)) for i in xrange(1, 4))
     buffered = SynchronizedBufferedStream(buffers)
     self.assertListEqual([0, 0, 1, 0, 1, 2], buffered.toList())
예제 #16
0
 def test_slist_repr_nominal(self):
     """repr() of an slist matches repr() of the plain list it wraps."""
     plain = [1, 2, 3]
     self.assertEquals(repr(slist(plain)), repr(plain))
예제 #17
0
 def test_slist_str_nominal(self):
     """str() of an slist matches str() of the underlying list."""
     plain = [1, 2, 3]
     wrapped = slist(plain)
     # Removed the unused intermediate `s1 = str(s)` the original computed
     # and never read.
     self.assertEquals(str(wrapped), str(plain))
예제 #18
0
 def test_reversedNominal(self):
     """reversed() yields the elements back-to-front."""
     items = slist([1, 2, 3])
     self.assertListEqual([3, 2, 1], items.reversed().toList())
예제 #19
0
파일: parser.py 프로젝트: asuiu/pyNLP
    def iterativeParse_jTree(self, jTree, sfp):
        """
        Parses the Tree object from JVM into Python VM
        Returns tuple (tree, dictionary of WordPositions)
        :type sfp: StanfordParser
        :type jTree: jpype._jclass.edu.stanford.nlp.trees.LabeledScoredTreeNode
        :rtype :  PhraseNode, slist[SyntWordNode]
        """
        # Iterative (explicit-stack) pre-order walk over the Java parse
        # tree; avoids Python recursion limits on deep trees.
        level = 0
        tree_index = 0
        ptree = self._constructPhraseNode(CTags.fromString(str(jTree.value())), level, jTree, self, tree_index)
        """:type : CPhraseNode"""
        level += 1  # only root is level 0
        tree_index += 1

        # Seed the work stack with the root's children; reversed so that
        # pop() from the end visits children left-to-right.
        q = []
        for c in jTree.children():
            q.append((c, ptree, level))
        q = list(reversed(q))

        # Maps character begin-position -> constructed word node.
        hWordsPositions = {}
        while len(q):
            # Each stack entry is (java node, parent python node, depth).
            node = q.pop()
            wh_add = node[1]
            level = node[2]
            node = node[0]
            if node.isLeaf():
                # A bare leaf must always be wrapped by a pre-terminal.
                raise ValueError("Malformed syntactic tree: unexpected leaf")
            elif node.isPreTerminal():
                # Pre-terminal = POS-tag node whose single child is the word.
                tag = str(node.value())
                w_label = node.children()[0].label()
                v = w_label.value()
                # Replace non-ASCII bytes before handing the word to the stemmer.
                word = re.sub(r'[\x80-\xFF]', r"_Unknown_char_", v)
                wtag = sfp.stem(word, tag)
                _wrd = wtag.word()
                if _wrd is None:
                    raise Exception("wtag returned by stemmer returns NULL word() for word %s and tag %s: %s %s" % (
                        word, tag, str(wtag), str(wtag.__class__)))

                # Manual stemming overrides take precedence over the stemmer.
                if wtag.word().lower() in STEM_EXCEPTIONS:
                    stemmed = STEM_EXCEPTIONS[wtag.word().lower()]
                else:
                    stemmed = wtag.word().lower()

                # Character offsets of the word in the original sentence.
                w_pos = (int(w_label.beginPosition()), int(w_label.endPosition()))
                nt = self._constructWordNode(word, CTags.fromString(tag), stemmed, level, w_pos, node, self, tree_index,
                                             wh_add)
                tree_index += 1
                hWordsPositions[int(w_label.beginPosition())] = nt
                wh_add.addChild(nt)
            else:
                # Interior phrase node: construct it, then push its children
                # one level deeper (reversed to keep left-to-right order).
                nt = self._constructPhraseNode(CTags.fromString(str(node.value())), level, node, self, tree_index,
                                               wh_add)
                tree_index += 1
                wh_add.addChild(nt)
                level += 1
                chldren = []
                for c in node.children():
                    chldren.append((c, nt, level))
                q.extend(reversed(chldren))
        # Order word nodes by character begin-position, then assign each
        # its sequential position within the sentence.
        wNodes = slist(hWordsPositions[p] for p in sorted(hWordsPositions.keys()))
        """:type : list[SyntWordNode]"""
        for i in xrange(len(wNodes)):
            wNodes[i].set_position(i)

        return ptree, wNodes
예제 #20
0
 def test_flatMap_defaultIdentityFunction(self):
     """flatMap() with no mapper flattens the dicts into the set of their keys."""
     dicts = slist(({1: 2, 3: 4}, {5: 6, 7: 8}))
     self.assertEquals(set((1, 3, 5, 7)), dicts.flatMap().toSet())