Example #1
    def __init__(self, lst_treetagger=None):
        """
        MWEChunker constructor
        :param lst_treetagger: list generated by the TreeTagger POS-tagging method
        """
        self.MLE_THR = 0.05
        self._list_tt = lst_treetagger

        self._new_list_tt = []
        self._raw_mwes = []
        self._counter = {}

        self.DICE_THR = 0.065
        # Prepositional phrases
        self._pp_rule_set = [ChunkRule("<IN><NP>", "PrepPHR")]

        # Noun compounds
        # 2-gram rules
        self._nc_2gram_set = [
            ChunkRule("<NN><NN.?>", descr="(SUB(Plur)?)? (SUB(Plur)?)?"),
            ChunkRule("<JJ><NN.?>", descr="ADJ (SUB(Plur)?)?"),
            ChunkRule("<PPH><NN.?>", descr="AAN (SUB(Plur)?)?"),
            ChunkRule("<NN.?><JJ>", descr="(SUB(Plur)?)? ADJ"),
            ChunkRule("<NN.?><PPH>", descr="(SUB(Plur)?)? AAN")
        ]

        # n-gram rules
        self._nc_ngram_set = [
            ChunkRule("(<JJ.?>|<PPH>)+<NN><NN.?>?",
                      descr="(ADJ|PrepPHR)+ SUB SUB?"),
            ChunkRule("<NN><NN.?>(<JJ.?>|<PPH>)*",
                      descr="SUB SUB (ADJ|PrepPHR)*")
        ]
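As a usage sketch (not part of the original class), one of the 2-gram rules above can be handed straight to RegexpChunkParser; the tagged sentence is invented:

from nltk.chunk.regexp import ChunkRule, RegexpChunkParser
from nltk.tree import Tree

rules = [ChunkRule("<JJ><NN.?>", descr="ADJ (SUB(Plur)?)?")]
chunker = RegexpChunkParser(rules, chunk_label='NC')
print(chunker.parse(Tree('S', [('green', 'JJ'), ('tea', 'NN'), ('helps', 'VBZ')])))
# -> (S (NC green/JJ tea/NN) helps/VBZ)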
Example #2
    def __init__(self, w_pattern, w_split_left, w_split_right):

        self._pattern = ChunkRule(w_pattern,
                                  'chunk compose clause between conjunction')
        self._split = SplitRule(right_tag_pattern=w_split_right,
                                left_tag_pattern=w_split_left,
                                descr='split the subordinate clause')
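To see what SplitRule does in isolation, here is a standalone sketch (the patterns are illustrative, not the constructor's w_split_* arguments):

from nltk.chunk.regexp import ChunkString, ChunkRule, SplitRule
from nltk.tree import Tree

cs = ChunkString(Tree('S', [('the', 'DT'), ('cat', 'NN'),
                            ('the', 'DT'), ('dog', 'NN')]))
ChunkRule('<DT><NN><DT><NN>', 'chunk everything').apply(cs)
# split the single chunk between a noun and the following determiner
SplitRule('<NN>', '<DT>', 'split between NN and DT').apply(cs)
print(cs.to_chunkstruct())
# -> (S (CHUNK the/DT cat/NN) (CHUNK the/DT dog/NN))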
Example #3
# Loading Libraries
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule
from nltk.tree import Tree

# ChunkString() starts with the flat tree
tree = Tree('S', [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'),
                  ('many', 'JJ'), ('chapters', 'NNS')])

# Initializing ChunkString()
chunk_string = ChunkString(tree)
print("Chunk String : ", chunk_string)

# Initializing ChunkRule
chunk_rule = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
chunk_rule.apply(chunk_string)
print("\nApplied ChunkRule : ", chunk_string)

# Initializing ChinkRule
chink_rule = ChinkRule('<VB.*>', 'chink verbs')
chink_rule.apply(chunk_string)
print("\nApplied ChinkRule : ", chunk_string, "\n")

# Back to a chunked sub-tree
print(chunk_string.to_chunkstruct())
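The edit can also be reversed: UnChunkRule (used again in Example #5 below) strips the braces from any chunk whose contents match its pattern. Continuing from chunk_string above, a short sketch:

from nltk.chunk.regexp import UnChunkRule

UnChunkRule('<JJ><NNS>', 'unchunk adjective + plural noun').apply(chunk_string)
print(chunk_string)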
Example #4
from nltk.chunk.regexp import (ChunkString, ChunkRule, ChinkRule,
                               RegexpParser, RegexpChunkParser)
from nltk.tree import Tree

s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'),
     ('chapters', 'NNS')]
# forward: chunk with a grammar string
chunker = RegexpParser(r'''
NP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{''')

print(chunker.parse(s))

# back: rebuild the chunk structure manually, rule by rule
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

chunker = RegexpChunkParser([ur, ir])
print(chunker.parse(t))

# set the chunk label
chunker = RegexpChunkParser([ur, ir], chunk_label='CP')
print(chunker.parse(t))
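For comparison, the same pair of rules can be written as one grammar string with the same CP label; this sketch simply mirrors the rules above:

cp_chunker = RegexpParser(r'''
CP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{''')
print(cp_chunker.parse(s))
# -> (S (CP the/DT book/NN) has/VBZ (CP many/JJ chapters/NNS))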
Example #5
    rssproc.summarizetexts('htmltext')

    # filcounts = filter(rssproc.filtercount,counts.values())
    # highestcount = [x for x in filcounts][-1]['count']
    # filcounts = filter(rssproc.filtercount, counts.values())
    # filweights = {}
    # for x in filcounts:
    #     item =x
    #     item['count'] = item['count']/highestcount
    #     filweights[x['word']] = item
    # print([x for x in filweights.values()])
    senttokens = rssproc.senttokenizedtext[1]['htmltext']

    s = 'there are 12 boxes in the closet'

    # ChunkRule makes a chunk; the Expand rules then grow an existing chunk
    # over neighbouring tags
    ur = ChunkRule('<CD>', 'chunk cardinal numbers')
    el = ExpandLeftRule('<NNS>', '<CD>', 'expand left over a plural noun')
    er = ExpandRightRule('<CD>', '<NNS>', 'expand right over a plural noun')
    un = UnChunkRule('<DT><NN.*>*', 'unchunk determiner + nouns')

    chunker = RegexpChunkParser([ur, el, er])

    print(chunker.parse(pos_tag(word_tokenize(s))))

    d = []
    for sent in senttokens:
        tk = word_tokenize(sent)
        tkpos = pos_tag(tk)
        for x in tkpos:
            if x[1] == 'CD':
                # the source truncates here; as a plausible completion,
                # collect the tagged sentence and move on
                d.append(tkpos)
                break
Example #6
# Loading Libraries
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule
from nltk.tree import Tree
from nltk.chunk import RegexpChunkParser

# ChunkString() starts with the flat tree
tree = Tree('S', [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'),
                  ('many', 'JJ'), ('chapters', 'NNS')])

# Initializing ChunkRule
chunk_rule = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')

# Initializing ChinkRule
chink_rule = ChinkRule('<VB.*>', 'chink verbs')

# Applying RegexpChunkParser
chunker = RegexpChunkParser([chunk_rule, chink_rule], chunk_label='CP')
print(chunker.parse(tree))
# -> (S (CP the/DT book/NN) has/VBZ (CP many/JJ chapters/NNS))
Example #7
from nltk import word_tokenize, HunposTagger
from nltk.chunk.regexp import ChunkRule, RegexpChunkParser

from talkytalky.util.util import get_project_root

"""
Rules for making phrases from tokens already labelled with parts of speech
"""
# Grammar source: https://github.com/ICTRC/Parsivar/blob/master/parsivar/chunker.py
PARSIVAR_CHUNK_RULES = [
    ChunkRule('<ADJ_SIM><V_PRS>', 'VP'),
    ChunkRule('<ADJ_INO><V.*>', 'VP'),
    ChunkRule('<V_PRS><N_SING><V_SUB>', 'VP'),
    ChunkRule('<N_SING><ADJ.*><N_SING>', 'NP'),
    ChunkRule('<N.*><PRO>', 'NP'),
    ChunkRule('<N_SING><V_.*>', 'VP'),
    ChunkRule('<V.*>+', 'VP'),
    ChunkRule('<ADJ.*>?<N.*>+<ADJ.*>?', 'NP'),
    ChunkRule('<DET><NP>', 'DNP'),
    ChunkRule('<ADJ_CMPR><P>', 'PP'),
    ChunkRule('<ADJ_SIM><P>', 'PP'),
    ChunkRule('<P><N_SING>', 'PP'),
    ChunkRule('<P>*', 'PP'),
    ChunkRule('<NP><DNP>', 'DDNP'),
    ChunkRule('<PP><NP>+', 'NPP')
]
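Note that RegexpChunkParser assigns one chunk_label to every chunk it creates, so the 'VP'/'NP' strings above act only as rule descriptions. A minimal sketch of wiring the rules up follows; the model path, chunk label, and input are assumptions, not from the original module:

# hypothetical usage: 'persian.model' stands in for a real HunPos model file
tagger = HunposTagger('persian.model')
tokens = word_tokenize('...')  # some Persian input text
# nltk's HunposTagger may return tags as bytes, so normalize to str
tagged = [(w, t.decode('utf-8')) if isinstance(t, bytes) else (w, t)
          for w, t in tagger.tag(tokens)]
chunker = RegexpChunkParser(PARSIVAR_CHUNK_RULES, chunk_label='PHRASE')
print(chunker.parse(tagged))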

# Grammar source: https://github.com/nicolashernandez/PyRATA/blob/master/do_benchmark.py
# Doesn't appear to work for clauses.
PYRATA_CHUNK_RULES = [
    ChunkRule('<DT|JJ|NN.*>+', 'NP'),