def __init__(self, lst_treetagger=None):
    """Set up the chunker's state and its chunking rule sets.

    :param lst_treetagger: list generated by tree-tagger POS tagging method
    """
    # Association-score thresholds used when filtering candidate MWEs.
    self.MLE_THR = 0.05
    self.DICE_THR = 0.065
    # Input token list and working buffers for the chunking passes.
    self._list_tt = lst_treetagger
    self._new_list_tt = []
    self._raw_mwes = []
    self._counter = {}
    # Prepositional phrases
    self._pp_rule_set = [ChunkRule("<IN><NP>", "PrepPHR")]
    # Noun compounds — 2-gram rules, built from (pattern, description) pairs.
    two_gram_specs = [
        ("<NN><NN.?>", "(SUB(Plur)?)? (SUB(Plur)?)?"),
        ("<JJ><NN.?>", "ADJ (SUB(Plur)?)?"),
        ("<PPH><NN.?>", "AAN (SUB(Plur)?)?"),
        ("<NN.?><JJ>", "(SUB(Plur)?)? ADJ"),
        ("<NN.?><PPH>", "(SUB(Plur)?)? AAN"),
    ]
    self._nc_2gram_set = [ChunkRule(pat, descr=txt) for pat, txt in two_gram_specs]
    # Noun compounds — n-gram rules.
    ngram_specs = [
        ("(<JJ.?>|<PPH>)+<NN><NN.?>?", "(ADJ|PrepPHR)+ SUB SUB?"),
        ("<NN><NN.?>(<JJ.?>|<PPH>)*", "SUB SUB (ADJ|PrepPHR)*"),
    ]
    self._nc_ngram_set = [ChunkRule(pat, descr=txt) for pat, txt in ngram_specs]
def __init__(self, w_pattern, w_split_right=None, w_split_left=None, **_legacy):
    """Build the chunk-and-split rule pair for a subordinate clause.

    :param w_pattern: tag pattern matching the clause to chunk
    :param w_split_left: tag pattern on the left side of the split point
    :param w_split_right: tag pattern on the right side of the split point
    """
    clause_rule = ChunkRule(w_pattern, 'chunk compose clause between conjunction')
    split_rule = SplitRule(
        left_tag_pattern=w_split_left,
        right_tag_pattern=w_split_right,
        descr='split the subordinate clause',
    )
    self._pattern = clause_rule
    self._split = split_rule
# Loading Libraries
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule
from nltk.tree import Tree

# ChunkString() starts with the flat POS-tagged tree.
tagged_tokens = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]
tree = Tree('S', tagged_tokens)

# Initializing ChunkString()
cs = ChunkString(tree)
print("Chunk String : ", cs)

# Chunk the determiner..noun span.
dt_nn_rule = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
dt_nn_rule.apply(cs)
print("\nApplied ChunkRule : ", cs)

# Chink the verb back out of the chunk.
verb_rule = ChinkRule('<VB.*>', 'chink verbs')
verb_rule.apply(cs)
print("\nApplied ChinkRule : ", cs, "\n")

# Back to chunk sub-tree
cs.to_chunkstruct()
# Demo: the same determiner/noun chunking done two ways — "forth" through a
# RegexpParser grammar string, then "back" by applying ChunkRule/ChinkRule to a
# ChunkString by hand and converting back with to_chunkstruct(); finally a
# RegexpChunkParser is built from the two rules, with and without a custom
# chunk_label ('CP').
# NOTE(review): this statement was flattened onto one physical line; the
# internal line breaks of the triple-quoted RegexpParser grammar (which is
# line-oriented) were lost — confirm the grammar's original layout before any
# reformatting of this line.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')] # forth chunker = RegexpParser(r''' NP: {<DT><NN.*><.*>*<NN.*>} }<VB.*>{''') print(chunker.parse(s)) # back t = Tree('S', s) cs = ChunkString(t) print(cs) ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns') ur.apply(cs) print(cs) ir = ChinkRule('<VB.*>', 'chink verbs') ir.apply(cs) print(cs) print(cs.to_chunkstruct()) # cs.to_chunkstruct().draw() chunker = RegexpChunkParser([ur, ir]) print(chunker.parse(t)) # set chunk name chunker = RegexpChunkParser([ur, ir], chunk_label='CP')
# Summarizes the 'htmltext' feed via rssproc, then demonstrates chunking a
# cardinal number with its plural noun: ChunkRule grabs the <CD> token and
# ExpandLeftRule/ExpandRightRule pull the neighbouring <NNS> into the chunk
# (the UnChunkRule `un` is built but not passed to the parser). It then POS-tags
# each sentence of rssproc.senttokenizedtext[1]['htmltext'] looking for CD tags.
# NOTE(review): the trailing `for`/`if` is truncated in this view — the body of
# `if 'CD' in x:` is missing, so this line is not runnable as shown; the
# remainder lies outside this chunk. The block of commented-out count-weighting
# code should probably be deleted or restored deliberately.
rssproc.summarizetexts('htmltext') # filcounts = filter(rssproc.filtercount,counts.values()) # highestcount = [x for x in filcounts][-1]['count'] # filcounts = filter(rssproc.filtercount, counts.values()) # filweights = {} # for x in filcounts: # item =x # item['count'] = item['count']/highestcount # filweights[x['word']] = item # print([x for x in filweights.values()]) senttokens = rssproc.senttokenizedtext[1]['htmltext'] s = 'there are 12 boxes in the closet' ur = ChunkRule('<CD>', 'single noun') el = ExpandLeftRule('<NNS>', '<CD>', 'get left determiner') er = ExpandRightRule('<CD>', '<NNS>', 'get right plural noun') un = UnChunkRule('<DT><NN.*>*', 'unchunk everything') chunker = RegexpChunkParser([ur, el, er]) print(chunker.parse(pos_tag(word_tokenize(s)))) d = [] for sent in senttokens: tk = word_tokenize(sent) tkpos = pos_tag(tk) for x in tkpos: if 'CD' in x:
# Demo: chunk determiner+noun spans and chink verbs back out via RegexpChunkParser.
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule
from nltk.tree import Tree
from nltk.chunk import RegexpChunkParser

# Flat POS-tagged sentence as the starting tree.
tagged = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]
sentence_tree = Tree('S', tagged)

# Two-step rule pipeline: chunk the DT..NN span, then chink the verb out.
rules = [
    ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns'),
    ChinkRule('<VB.*>', 'chink verbs'),
]

# Label the resulting chunks 'CP' and parse the tree.
parser = RegexpChunkParser(rules, chunk_label='CP')
parser.parse(sentence_tree)
# Near-duplicate of the earlier round-trip demo: "forth" with a RegexpParser
# grammar, "back" with manual ChunkRule/ChinkRule application on a ChunkString,
# then RegexpChunkParser built from the same two rules, with and without the
# custom chunk_label 'CP'.
# NOTE(review): flattened from multi-line source; the line breaks inside the
# triple-quoted RegexpParser grammar (line-oriented syntax) were lost — confirm
# the original layout before reformatting. Consider de-duplicating this snippet
# against its earlier copy in this file.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')] # forth chunker = RegexpParser(r''' NP: {<DT><NN.*><.*>*<NN.*>} }<VB.*>{''' ) print(chunker.parse(s)) # back t = Tree('S', s) cs = ChunkString(t) print(cs) ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns') ur.apply(cs) print(cs) ir = ChinkRule('<VB.*>', 'chink verbs') ir.apply(cs) print(cs) print(cs.to_chunkstruct()) # cs.to_chunkstruct().draw() chunker = RegexpChunkParser([ur, ir]) print(chunker.parse(t)) # set chunk name chunker = RegexpChunkParser([ur, ir], chunk_label='CP')
# Chunk-rule tables for a phrase chunker over POS-tagged tokens:
# PARSIVAR_CHUNK_RULES uses Persian POS tags (grammar credited to the Parsivar
# project in the inline comment); PYRATA_CHUNK_RULES (credited to PyRATA)
# begins here as well.
# NOTE(review): the PYRATA_CHUNK_RULES list literal is cut off at the end of
# this view — its remaining rules and closing bracket lie outside this chunk,
# so this line is not runnable as shown.
from nltk import word_tokenize, HunposTagger from nltk.chunk.regexp import ChunkRule, RegexpChunkParser from talkytalky.util.util import get_project_root """ Rules for making phrases from tokens already labelled with parts of speech """ # Grammar source: https://github.com/ICTRC/Parsivar/blob/master/parsivar/chunker.py PARSIVAR_CHUNK_RULES = [ ChunkRule('<ADJ_SIM><V_PRS>', 'VP'), ChunkRule('<ADJ_INO><V.*>', 'VP'), ChunkRule('<V_PRS><N_SING><V_SUB>', 'VP'), ChunkRule('<N_SING><ADJ.*><N_SING>', 'NP'), ChunkRule('<N.*><PRO>', 'NP'), ChunkRule('<N_SING><V_.*>', 'VP'), ChunkRule('<V.*>+', 'VP'), ChunkRule('<ADJ.*>?<N.*>+ <ADJ.*>?', 'NP'), ChunkRule('<DET><NP>', 'DNP'), ChunkRule('<ADJ_CMPR><P>', 'PP'), ChunkRule('<ADJ_SIM><P>', 'PP'), ChunkRule('<P><N_SING>', 'PP'), ChunkRule('<P>*', 'PP'), ChunkRule('<NP><DNP>', 'DDNP'), ChunkRule('<PP><NP>+', 'NPP') ] # Grammar source: https://github.com/nicolashernandez/PyRATA/blob/master/do_benchmark.py # Doesn't appear to work for clauses. PYRATA_CHUNK_RULES = [ ChunkRule('<DT|JJ|NN.*>+', 'NP'),