def compress_grammar(grammar):
    comp_dict = get_compress_dict(grammar)
    # Drop the productions whose left-hand side is being compressed away.
    grammar._productions = list(filter(lambda p: p.lhs() not in comp_dict,
                                       grammar._productions))
    for ind, prod in enumerate(grammar._productions):
        # Replace compressed symbols on the right-hand side.
        rhs = [r if r not in comp_dict else comp_dict[r] for r in prod.rhs()]
        # Pass the probability to the constructor instead of overwriting the
        # prob() method on the new production.
        new_prod = ProbabilisticProduction(prod.lhs(), rhs, prob=prod.prob())
        grammar._productions[ind] = new_prod
    return grammar
def create_duplications(grammar, dup_prob):
    dup_prods = []
    for ind, prod in enumerate(grammar._productions):
        rhs = prod.rhs()
        # Only lexical productions (a single terminal on the RHS) are duplicated.
        if len(rhs) == 1 and isinstance(rhs[0], str):
            if prod.prob() != 1.0:
                raise BaseException("Can't handle this currently")
            new_prod = ProbabilisticProduction(prod.lhs(), rhs, prob=1 - dup_prob)
            grammar._productions[ind] = new_prod
            dup_prod = ProbabilisticProduction(prod.lhs(), [rhs[0], prod.lhs()],
                                               prob=dup_prob)
            dup_prods.append(dup_prod)
    for dup_prod in dup_prods:
        grammar._productions.append(dup_prod)
    return grammar
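# Illustration (assumed behaviour, read off the code above): a lexical rule such as
#     Noun -> 'gato' [1.0]
# is split into the pair
#     Noun -> 'gato'      [1 - dup_prob]
#     Noun -> 'gato' Noun [dup_prob]
# so the nonterminal can emit its terminal one or more times.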
def test_productions(self):
    t = Tree.fromstring(
        """
            (S
                (NP (Det el) (Noun gato))
                (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
            )
        """)

    # Bugfix from official test (, start='S')
    model = UPCFG([t], start='S')

    prods = model.productions()

    prods2 = [
        ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
        ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]

    self.assertEqual(set(prods), set(prods2))
def _learning_by_biclustering(G, C, T):
    print("learning...")
    global biclusters
    global ignore_mc_ec

    ## find the valid bicluster BC in T that leads to the maximal posterior gain (Eq.2)
    BC = None
    ## first attempt
    attempts = 3
    while BC is None and attempts > 0:
        attempts -= 1
        BC = _get_best_bicluster(T, C)
    if BC is None:
        ignore_mc_ec = True
        ## second attempt
        attempts = 2
        while BC is None and attempts > 0:
            attempts -= 1
            BC = _get_best_bicluster(T, C)
        if BC is None:
            return False, G, C, T, None
        ignore_mc_ec = False

    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_" + str(_get_and_symb_index()))
    A = Nonterminal("_OR_" + str(_get_or_symb_index()))
    B = Nonterminal("_OR_" + str(_get_or_symb_index()))

    bc = BC.values  # DataFrame.as_matrix() was removed from pandas; .values is equivalent
    s = np.sum(bc)
    row_prob = np.sum(bc, 1) / s
    col_prob = np.sum(bc, 0) / s

    ## create the production rules
    rules = []
    rules += [ProbabilisticProduction(A, [_format_nt(BC.index[i])], prob=row_prob[i])
              for i in range(BC.shape[0])]
    rules += [ProbabilisticProduction(B, [_format_nt(BC.columns[j])], prob=col_prob[j])
              for j in range(BC.shape[1])]
    rules += [ProbabilisticProduction(N, [A, B], prob=1.)]

    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules)   # add the rules to G
    C_reduced = _reduce_corpus(C, BC, N)                    # reduce the corpus
    T_updated = _create_t(C_reduced)                        # update T
    biclusters[(N.symbol(), A.symbol(), B.symbol())] = BC   # save BC for the learned group
    return True, G_updated, C_reduced, T_updated, N
def pcfg_generate(grammar):
    def non_terminal_into_terminal(non_terminal):
        nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
        my_dict = dict()
        for pr in nt_productions:
            my_dict[pr.rhs()] = pr.prob()
        nt_productions_probDist = DictionaryProbDist(my_dict)
        genereted = nt_productions_probDist.generate()
        return list(genereted)

    def nts_into_ts(genereted_nts):
        for index in range(len(genereted_nts)):
            old_nt = genereted_nts[index]
            try:
                t = non_terminal_into_terminal(genereted_nts[index])
            except Exception:
                # skip symbols that cannot be expanded any further
                continue
            productions_corpus.append(
                ProbabilisticProduction(Nonterminal(old_nt), tuple(t), **{'prob': 0}))
            genereted_nts[index] = nts_into_ts(Tree(old_nt, t))
        return genereted_nts

    productions = grammar.productions()
    dic = dict()
    for pr in productions:
        dic[pr.rhs()] = pr.prob()
    productions_probDist = DictionaryProbDist(dic)
    genereted = productions_probDist.generate()
    productions_corpus.append(
        ProbabilisticProduction(Nonterminal('S'), genereted, **{'prob': 0}))
    genereted = Tree('S', [genereted[0], genereted[1]])
    return nts_into_ts(genereted)
def parse(self, tokens):
    tagged = nltk.pos_tag(tokens)
    missing = False
    for tok, pos in tagged:
        if not self._grammar._lexical_index.get(tok):
            missing = True
            # Add a tiny-probability lexical rule for tokens the grammar has never seen.
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
            # WeightedProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        self._grammar._calculate_indexes()

    # returns a generator, so call 'next' to get the ProbabilisticTree
    tree = super(PCFGViterbiParser, self).parse(tokens)
    if issubclass(tree.__class__, nltk.tree.Tree):
        print('returning a tree')
        return tree
    elif isinstance(tree, types.GeneratorType):
        try:
            return next(tree)
        except StopIteration:
            tweet = ' '.join(tokens)
            print("Couldn't parse {}".format(tweet))
            return None
    else:
        error("Type of tree is: {}".format(type(tree)))
def main(args):
    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()
    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)

    # Replace the HS and ES rules with the freshly estimated terminal rules.
    HSrules = grammar.productions(Nonterminal('HS'))
    for rule in HSrules:
        grammar.productions().remove(rule)
    ESrules = grammar.productions(Nonterminal('ES'))
    for rule in ESrules:
        grammar.productions().remove(rule)
    grammar.productions().extend(terminalProductionRules)

    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(Nonterminal(token.upper()), [str(token)], prob=1))
    #print "Grammars"
    grammarlist = str(grammar).split('\n')[1:]

    #print "Transfered"
    # Serialize the grammar so it can be re-read with normalized probabilities.
    strgrammar = ''
    for p in grammar.productions():
        rhs = p.rhs()
        rhsstr = ''
        for r in rhs:
            if is_terminal(r):
                rhsstr += '\'' + str(r) + '\' '
            else:
                rhsstr += str(r) + ' '
        strgrammar += str(p.lhs()) + ' -> ' + rhsstr + ' [' + '{0:.8f}'.format(p.prob()) + ']\n'
    #print strgrammar
    grammar = PCFG.fromstring(strgrammar.split('\n'))

    #'''
    #grammar = loadGrammar(args)
    #tokens = args.sentence.lower().split()
    #nonterm = getnonterm(grammar)
    CYK(tokens, nonterm, grammar)
    #with open(args.grammar_file, 'r') as f:
    #    content = f.read()
    #trees = corpus2trees(content)
    #productions = trees2productions(trees)
    #listnonterm = []
    #grammar = nltk.grammar.induce_pcfg(nltk.grammar.Nonterminal('SS'), productions)
    #print grammar
    #'''
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False

    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)

    T = _create_t(C)  # create a table T

    ## repeat until no further rule can be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niteration #%d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G)  # DEBUG
    return G
def get_productions(productions):
    probabilities = dict()
    productions_to_return = list(set(productions))
    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    amount_of_interior_nodes = len([prod.lhs() for prod in productions
                                    if prod.lhs() != Nonterminal('S')])
    lhs_of_prods = set([prod.lhs() for prod in productions])
    print('this is the amount of interior nodes: {}'.format(amount_of_interior_nodes))

    # Normalize the counts per left-hand side to obtain relative frequencies.
    for lhs in lhs_of_prods:
        number_of_occurrences = 0
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                number_of_occurrences += probabilities[prob]
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                probabilities[prob] = probabilities[prob] / number_of_occurrences

    for index in range(len(productions_to_return)):
        prod = productions_to_return[index]
        productions_to_return[index] = ProbabilisticProduction(
            prod.lhs(), prod.rhs(), **{'prob': probabilities[str(prod)]})
    return productions_to_return
def get_productions(productions):
    probabilities = dict()
    productions_to_return = list(set(productions))
    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    lhs_of_prods = set([prod.lhs() for prod in productions])
    # Normalize the counts per left-hand side to obtain relative frequencies.
    for lhs in lhs_of_prods:
        number_of_occurrences = 0
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                number_of_occurrences += probabilities[prob]
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                probabilities[prob] = probabilities[prob] / number_of_occurrences

    for index in range(len(productions_to_return)):
        prod = productions_to_return[index]
        productions_to_return[index] = ProbabilisticProduction(
            prod.lhs(), prod.rhs(), **{'prob': probabilities[str(prod)]})

    dist = FreqDist(productions_to_return)
    #dist.plot(len(probabilities))
    return productions_to_return, dist
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    parsed_sents -- list of training trees.
    start -- start symbol.
    horzMarkov -- None for default. A number n >= 0 for horizontal markov.
    """
    self.start = start

    count_Y_Z = defaultdict(lambda: defaultdict(int))
    count_X = defaultdict(int)

    for t in parsed_sents:
        # Work on a copy of the tree: we don't want to modify the original,
        # mutable structure.
        unle_trees = unlexicalize(t.copy(deep=True))
        # Chomsky normal form with horizontal markovization.
        unle_trees.chomsky_normal_form(horzMarkov=horzMarkov)
        # Collapse subtrees with a single child.
        unle_trees.collapse_unary(collapsePOS=True)

        for prod in unle_trees.productions():
            count_Y_Z[prod.lhs()][prod.rhs()] += 1
            count_X[prod.lhs()] += 1

    # Create the list of productions with ML-estimated probabilities.
    productions = []
    for X, c_X in count_X.items():
        for (Y_Z, c_Y_Z) in count_Y_Z[X].items():
            q = c_Y_Z / float(c_X)
            productions.append(ProbabilisticProduction(X, Y_Z, prob=q))
    self.production = productions

    grammar = PCFG(Nonterminal(start), productions)
    self.parser = CKYParser(grammar)
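# Worked example of the maximum-likelihood estimate computed above
# (illustrative counts, not taken from the original):
#     count(NP -> Det Noun) = 3,  count(NP) = 4
#     q = 3 / 4 = 0.75
# so the induced rule would be  NP -> Det Noun [0.75].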
def nts_into_ts(genereted_nts):
    for index in range(len(genereted_nts)):
        old_nt = genereted_nts[index]
        try:
            t = non_terminal_into_terminal(genereted_nts[index])
        except Exception:
            # skip symbols that cannot be expanded any further
            continue
        productions_corpus.append(
            ProbabilisticProduction(Nonterminal(old_nt), tuple(t), **{'prob': 0}))
        genereted_nts[index] = nts_into_ts(Tree(old_nt, t))
    return genereted_nts
def test_horz_markov_0(self):
    t = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

    model = UPCFG([t], horzMarkov=0)

    prods = model.productions()

    prods2 = [
        # the right-binarized productions:
        ProbabilisticProduction(N('NP'), [N('Det'), N('NP|<>')], prob=1.0),
        ProbabilisticProduction(N('NP|<>'), [N('Noun'), N('Adj')], prob=1.0),
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]

    self.assertEqual(set(prods), set(prods2))
def parse_batch(self, tagged):
    missing = False
    tokens = []
    for tok, pos in tagged:
        tokens.append(tok)
        if not self._grammar._lexical_index.get(tok):
            missing = True
            # Add a tiny-probability lexical rule for unknown tokens.
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        self._grammar._calculate_indexes()
    return super(PCFGViterbiParser, self).parse(tokens)
def test_horz_markov_None(self):
    t = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

    # Bugfix from official test (, start='NP')
    model = UPCFG([t], start='NP')  # horzMarkov=None by default

    prods = model.productions()

    prods2 = [
        # the right-binarized productions:
        ProbabilisticProduction(N('NP'), [N('Det'), N('NP|<Noun-Adj>')], prob=1.0),
        ProbabilisticProduction(N('NP|<Noun-Adj>'), [N('Noun'), N('Adj')], prob=1.0),
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]

    self.assertEqual(set(prods), set(prods2))
def __init__(self, parsed_sents, start='sentence', horzMarkov=None): """ parsed_sents -- list of training trees. """ # { A -> B : count(A -> B) } productions_counts = defaultdict(int) # { A : count(A) } lhs_count = defaultdict(int) # left_hand_side_count self.start = start # Para la gramatica del parser CKY self.prods = [] # Lista de producciones # Hacemos una copia de t porque al hacer el unlexicalize, este me # modifica el arbol # Original: unlexicalize_tree = [unlexicalize(t) for t in parsed_sents] unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents] for t in unlex_sents: t.chomsky_normal_form(horzMarkov=horzMarkov) t.collapse_unary(collapsePOS=True, collapseRoot=True) for prod in t.productions(): # type(prod): <class 'nltk.grammar.Production'> # type(prod.lhs): <class 'nltk.grammar.Nonterminal'> # type(prod.rhs): <class 'tuple'> # Cada elemento de prod.rhs() es del tipo: # <class 'nltk.grammar.Nonterminal'> productions_counts[prod] += 1 lhs_count[prod.lhs()] += 1 for prod, count_prod in productions_counts.items(): # type(production): <class 'nltk.grammar.Production'> # production : A -> B # type(count_prod): int # count_prod : count(A -> B) count_lhs = lhs_count.get(prod.lhs(), 0) # type(prod.lhs): <class 'nltk.grammar.Nonterminal'> # type(prod.rhs): <class 'tuple'> q_ML = float(count_prod) / count_lhs self.prods += [ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=q_ML)] # Cada elemento de self.prods es del tipo: # <class 'nltk.grammar.ProbabilisticProduction'> # type(PCFG(...)) = <class 'nltk.grammar.PCFG'> # PCFG(start, productions) # type(start): Nonterminal # type(productions): list(Production) grammar = PCFG(Nonterminal(start), self.prods) self.my_parser = CKYParser(grammar)
def read_productions(self, productions_filename):
    productions = []
    with io.open(productions_filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            components = line.split(u'+')
            lhs = Nonterminal(components[0])
            rhs = tuple([Nonterminal(nt.strip())
                         for nt in components[1].split(u' ')])
            prob = float(components[2])
            pp = ProbabilisticProduction(lhs, rhs, prob=prob)
            productions.append(pp)
    self.grammar = PCFG(Nonterminal('S'), productions)
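# Hypothetical input line for read_productions(), inferred from the parsing
# logic above ('+'-separated fields, RHS nonterminals separated by spaces):
#
#     S+NP VP+0.9
#
# which yields ProbabilisticProduction(Nonterminal('S'),
#                                      (Nonterminal('NP'), Nonterminal('VP')),
#                                      prob=0.9).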
def parse(self, tokens, tagger=None):
    # tokens = self._preprocess(list(tokens))
    if tagger is None:
        tagged = nltk.pos_tag(tokens)
    else:
        tagged = tagger.tag(tokens)
    # print tagged

    missing = False
    for tok, pos in tagged:
        if not self._grammar._lexical_index.get(tok):
            missing = True
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        self._grammar._calculate_indexes()

    return super(PCFGViterbiParser, self).parse(tokens)
def parse(self, tokens):
    tokens = self._preprocess(list(tokens))
    tagged = nltk.pos_tag(tokens)

    missing = False
    for tok, pos in tagged:
        if not self._grammar._lexical_index.get(tok):
            missing = True
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        self._grammar._calculate_indexes()

    print('HI')
    testlist = super(PCFGViterbiParser, self).parse(tokens)
    for test in testlist:
        test.draw()
    return super(PCFGViterbiParser, self).parse(tokens)
def parse(self, tokens):
    #tokens = self._preprocess(list(tokens))
    tagged = nltk.pos_tag(tokens)
    # tagged = tokens
    # print(tagged)
    # tokens = [i[0] for i in tagged]
    # print(tokens)

    missing = False
    for tok, pos in tagged:
        if not self._grammar._lexical_index.get(tok):
            missing = True
            self._grammar._productions.append(
                ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if missing:
        self._grammar._calculate_indexes()
    print(self._grammar)
    return super(PCFGViterbiParser, self).parse(tokens)
def _postprocessing(G, C):
    print("\npostprocessing...")

    ## remove the _START_ -> ... rule
    rules = []
    for prod in G.productions():
        if G.start().symbol() not in prod.lhs().symbol():
            rules.append(prod)
    if len(rules) == 0:
        return G

    ## create an OR symbol S
    S = Nonterminal("_START_")
    sss = {}  # single symbol sentences

    ## for each sentence s in C do
    ##   if s is fully reduced to a single symbol x then
    ##     add S -> x to G, or if the rule already exists, increase its weight by 1
    for sentence in sent_tokenize(C):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        t = word_tokenize(sentence)
        if len(t) == 1:
            sss[t[0]] = 1 if not t[0] in sss else sss[t[0]] + 1

    weight_sum = sum([sss[k] for k in sss])
    rules += [ProbabilisticProduction(S, [_format_nt(k)], prob=sss[k] / weight_sum)
              for k in sss]

    return PCFG(S, rules)
def renormalize(self, height=10**4, tol=10**(-17), min_height=100):
    """Return the renormalized grammar.

    Raise ValueError if the coverage of at least one nonterminal equals zero.

    Input:
        height - maximal height of the parse trees over which coverage is
            calculated.
        tol - tolerance used as a stopping condition: stop once the change is
            smaller than this value.
        min_height - overrides the tolerance stopping condition and computes
            coverage for all heights <= min_height. It also determines for how
            many previous steps the change is measured, i.e. for levels
            (height-1 - min_height/2).
        verbosity - if set to > 0, prints the stopping probability change,
            height and input tolerance.
    """
    coverages_dict = self.list_coverages(height, tol, min_height)
    if min(coverages_dict[A] for A in coverages_dict) < tol:  # input tol
        print([A for A in coverages_dict if coverages_dict[A] < tol])
        raise ValueError("Not all coverages are positive, so renormalization"
                         " cannot be performed (it would divide by zero).")

    def chi(prod, coverages_dict):
        """Renormalize the production probability p^~ as in Chi's paper, Eq. (22)."""
        subprobabs = prod.prob()
        for symbol in prod.rhs():
            if not isinstance(symbol, Nonterminal):
                continue  # terminals contribute a factor of 1
            else:
                subprobabs *= coverages_dict[symbol]
        return subprobabs / coverages_dict[prod.lhs()]

    prods = [ProbabilisticProduction(prod.lhs(), prod.rhs(),
                                     prob=chi(prod, coverages_dict))
             for prod in self.grammar.productions()]
    return PCFG(self.grammar.start(), prods)
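# A minimal worked sketch of the chi() renormalization above, with assumed
# coverage values (not from the original): the rule probability is multiplied
# by the coverage of every nonterminal on the RHS and divided by the coverage
# of the LHS.
from nltk.grammar import Nonterminal, ProbabilisticProduction

S, NP = Nonterminal('S'), Nonterminal('NP')
coverages = {S: 0.9, NP: 0.8}                               # assumed coverages
rule = ProbabilisticProduction(S, [NP, 'runs'], prob=0.5)
renormalized = rule.prob() * coverages[NP] / coverages[S]   # 0.5 * 0.8 / 0.9 ≈ 0.444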
def tree_to_production(tree):
    return ProbabilisticProduction(get_tag(tree),
                                   [get_tag(child) for child in tree],
                                   **{'prob': 0})
def _read_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return a list of productions.
    """
    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m:
        raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right-hand side.
    probabilities = [0.0]
    rhsides = [[]]
    optionals = [[]]  # keep track of optional productions
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0'
                                 % (probabilities[-1],))

        # Vertical bar -- start a new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            optionals.append([])
            pos = m.end()

        # String -- add a terminal.
        elif line[pos] in "\'\"":
            m = _TERMINAL_RE.match(line, pos)
            if not m:
                raise ValueError('Unterminated string')
            rhsides[-1].append(m.group(1)[1:-1])
            optionals[-1].append(False)
            pos = m.end()

        # Opening bracket -- start an optional production.
        elif line[pos] == '[':
            m = _OPTIONAL_RE.match(line, pos)  # just get rid of spaces
            pos = m.end()
            # should refactor out the following
            if line[pos] in "\'\"":
                m = _TERMINAL_RE.match(line, pos)
                if not m:
                    raise ValueError('Unterminated string')
                rhsides[-1].append(m.group(1)[1:-1])
                pos = m.end()
            else:
                nonterm, pos = nonterm_parser(line, pos)  # eats the spaces
                rhsides[-1].append(nonterm)
            # end of refactor
            optionals[-1].append(True)
            if line[pos] != ']':
                raise ValueError('Unterminated optional bracket')
            m = _OPTIONAL_END_RE.match(line, pos)
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)  # eats the spaces
            rhsides[-1].append(nonterm)
            optionals[-1].append(False)

    # Expand productions with optional elements.
    rhsides_temp = []
    for (optionality, rhs) in zip(optionals, rhsides):
        # in case there was more than one alternative separated by | (disjunction)
        if True in optionality:
            if probabilistic:
                raise ValueError('Optional terms not allowed in probabilistic grammar')
            optterms = [i for (i, isopt) in enumerate(optionality) if isopt]
            opttermlists = powerset(optterms)  # all possible combinations of optionals
            for optlist in opttermlists:
                rhstemp = rhs[:]
                for i in sorted(optlist, reverse=True):
                    del rhstemp[i]
                rhsides_temp.append(rhstemp)
        else:
            rhsides_temp.append(rhs)

    # Probabilities won't work with optionality!
    if probabilistic:
        return [ProbabilisticProduction(lhs, rhs, prob=probability)
                for (rhs, probability) in zip(rhsides, probabilities)]
    else:
        return [Production(lhs, rhs) for rhs in rhsides_temp]
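# Illustration of the optional-bracket extension handled above (hypothetical
# rule, not from the original): the line
#
#     NP -> Det [Adj] N
#
# is expanded into the two plain productions
#
#     NP -> Det Adj N
#     NP -> Det N
#
# Optional terms are rejected when the grammar is probabilistic, as the
# ValueError above enforces.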
def getTerminalProbability(args, pcfg_grammar, list_nonterm):
    #args.save_dir = args.english_save_dir
    #(modelen, charsen, vocaben) = getModel(args, 'en')
    #args.save_dir = args.hindi_save_dir
    #(modelhi, charhi, vocabhi) = getModel(args, 'hi')
    p = []

    # Hindi segments
    args.nonterm = 'HS'
    args.save_dir = args.hindi_save_dir
    args.num_sentence = 1000
    args.length = len(args.sentence)
    segmentshi = []
    #print "PCFG grammar", pcfg_grammar
    (lengthlist, listterminal) = getLength(args, pcfg_grammar, list_nonterm)
    #print "lengthlist", lengthlist
    for length in lengthlist:
        segmentshi.extend(createSegment(length, args.sentence.lower()))
    probdicthi = getModel(args, 'hi', segmentshi)
    listProb = list(probdicthi.values())  # list() so the English values can be appended below
    segmentshi = list(set(segmentshi))

    # English segments
    args.nonterm = 'ES'
    args.save_dir = args.english_save_dir
    (lengthlist, listterminal) = getLength(args, pcfg_grammar, list_nonterm)
    segmentsen = []
    for length in lengthlist:
        segmentsen.extend(createSegment(length, args.sentence.lower()))
    probdicten = getModel(args, 'en', segmentsen)
    listProb.extend(probdicten.values())
    segmentsen = list(set(segmentsen))

    # Rank-based probabilities for each segment.
    listProb = sorted(listProb)
    denom = (len(listProb) * (len(listProb) + 1)) / 2

    prob1 = 0
    for segment in segmentshi:
        probnew = (listProb.index(probdicthi[segment]) + 1.0) / (denom + 1)
        probnew = float("{0:.8f}".format(round(probnew, 8)))
        prob1 += probnew
        #print segment, probnew
        p.append(
            ProbabilisticProduction(
                Nonterminal('HS'),
                [Nonterminal(token.upper()) for token in segment.split()],
                prob=probnew))
    p.append(ProbabilisticProduction(Nonterminal('HS'), ['Dummy'], prob=(1.0 - prob1)))
    #print 'HS', prob1, 1.0-prob1

    prob1 = 0
    for segment in segmentsen:
        probnew = (listProb.index(probdicten[segment]) + 1.0) / (denom + 1)
        probnew = float("{0:.8f}".format(round(probnew, 8)))
        prob1 += probnew
        p.append(
            ProbabilisticProduction(
                Nonterminal('ES'),
                [Nonterminal(token.upper()) for token in segment.split()],
                prob=probnew))
    #print 'ES', prob1, 1.0-prob1
    p.append(ProbabilisticProduction(Nonterminal('ES'), ['Dummy'], prob=(1.0 - prob1)))

    return p
def _attaching(N, G, C, T):
    print("attaching...")

    C_derived = _apply_grammar(G, C)

    ORs = []  # list of OR nonterminals
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)

    ## for each OR symbol O in G do
    for O in ORs:
        ## if O leads to a valid expanded bicluster
        ## as well as a posterior gain (Eq.3) larger than a threshold then
        #
        # AND-OR group
        group = None
        pos = None  # left or right (odd-False or even-True)
        ## retrieve the AND-OR group of O
        for g in biclusters:
            if O.symbol() in g[1] or O.symbol() in g[2]:
                group = g
                break
        ## retrieve the position of O in the group
        num = int(O.symbol()[4:])  # OR number, e.g. "_OR_2" -> 2
        pos = True if num % 2 == 0 else False

        #
        # BC_tilde and BC_tilde_prime
        ## create BC_t (BC_tilde)
        BC_t = biclusters[group].copy()
        ## fill BC_t
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## create BC_t_1 (BC_tilde_prime) (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        ## fill BC_t_1
        if pos == False:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((N.symbol(), x)), C) for x in BC_t.columns]
            BC_t_1.loc[N.symbol(), :] = new_row
            BC_t_1 = BC_t_1.astype(int)
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x, N.symbol())), C) for x in BC_t.index]
            BC_t_1.loc[:, N.symbol()] = new_col
            BC_t_1 = BC_t_1.astype(int)

        #
        # EC_tilde and EC_tilde_prime
        ## create and fill EC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## create EC_t_1
        EC_t_1 = EC_t.copy()
        ## add the new rows of EC_t_1
        if pos == False:
            ## OR on the left
            new_row_indices = [(N.symbol(), col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row, N.symbol()) for row in BC_t_1.index]
        ## fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str, :] = [-1] * EC_t_1.shape[1]
            for j in EC_t_1.columns:
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False))  # expression, context
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str, j] = _count_occ(" ".join([c[0], e, c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)

        bc_t_1 = BC_t_1.values  # .as_matrix() was removed from pandas; .values is equivalent
        ec_t_1 = EC_t_1.values
        bc_t = BC_t.values
        ec_t = EC_t.values

        #
        # LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        ## skip this OR unless BC and EC (both versions) are valid (multiplicatively coherent)
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)

        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(), N.symbol()))
            bc = BC_t_1.values
            s = np.sum(bc)
            row_prob = np.sum(bc, 1) / s
            col_prob = np.sum(bc, 0) / s

            ## keep every rule that does not rewrite O
            rules = []
            for prod in G.productions():
                if O.symbol() not in prod.lhs().symbol():
                    rules.append(prod)
            ## add the new rules
            if pos == False:
                ## OR on the left
                probs = row_prob
                rhs_symbols = [x for x in BC_t.index] + [N]
                for i in range(BC_t_1.shape[0]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[i]], prob=probs[i]))
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = [x for x in BC_t.columns] + [N]
                for j in range(BC_t_1.shape[1]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[j]], prob=probs[j]))

            ## updates
            biclusters[group] = BC_t_1.copy()                  # update the AND-OR group
            G = PCFG(G.start(), rules)                         # update G
            C = _reduce_corpus(C, biclusters[group], N, True)  # reduce C
            T = _create_t(C)                                   # update T
    return G, C, T
        probabilities[str(prod)] = 1

lhs_of_prods = set([prod.lhs() for prod in original_production_corpus])
for lhs in lhs_of_prods:
    number_of_occurrences = 0
    for prob in probabilities:
        if prob.startswith(str(lhs) + " "):
            number_of_occurrences += probabilities[prob]
    for prob in probabilities:
        if prob.startswith(str(lhs) + " "):
            probabilities[prob] = probabilities[prob] / number_of_occurrences

for index in range(len(productions_corpus)):
    prod = productions_corpus[index]
    productions_corpus[index] = ProbabilisticProduction(
        prod.lhs(), prod.rhs(), **{'prob': probabilities[str(prod)]})

productions_toy_pcfg2 = toy_pcfg2.productions()
lhs_of_prods = set([str(prod.lhs()) for prod in original_production_corpus] +
                   [str(prod.lhs()) for prod in productions_toy_pcfg2])


def compute_kl_divergence(mle_dist1, mle_dist2):
    ans = 0
    for p in mle_dist1.freqdist():
        for q in mle_dist2.freqdist():
            if p.rhs() == q.rhs():
                ans += p.prob() * math.log(p.prob() / q.prob())
    return ans
def baseline(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    PP = Nonterminal("PP")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vc = Nonterminal("Vc")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    P = Nonterminal("P")

    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(NP, [Det, N], prob=1.),
        ProbabilisticProduction(VP, [Vt, NP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vc, PP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vi], prob=1 / 3),
        ProbabilisticProduction(PP, [P, NP], prob=1.),
        ProbabilisticProduction(Det, ["a"], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=.5),
        ProbabilisticProduction(Vt, ["touches"], prob=.5),
        ProbabilisticProduction(Vt, ["covers"], prob=.5),
        ProbabilisticProduction(Vi, ["rolls"], prob=.5),
        ProbabilisticProduction(Vi, ["bounces"], prob=.5),
        ProbabilisticProduction(Vc, ["is"], prob=1.),
        ProbabilisticProduction(N, ["circle"], prob=1 / 3),
        ProbabilisticProduction(N, ["square"], prob=1 / 3),
        ProbabilisticProduction(N, ["triangle"], prob=1 / 3),
        ProbabilisticProduction(P, ["above"], prob=.5),
        ProbabilisticProduction(P, ["below"], prob=.5)
    ]

    G = PCFG(S, R)  # grammar
    C = ""          # corpus

    ## every sentence that can be generated
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C
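## A usage sketch (an assumption, not part of the original listing): build the
## toy grammar, print it, and keep the generated corpus for later learning.
if __name__ == "__main__":
    G, C = baseline(depth=5, n=20)
    print(G)                    # the handwritten PCFG
    print(C.split('. ')[:3])    # first few generated sentences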
def langley_1(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    AP = Nonterminal("AP")
    Adj = Nonterminal("Adj")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")

    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(VP, [Vi], prob=.5),
        ProbabilisticProduction(VP, [Vt, NP], prob=.5),
        ProbabilisticProduction(NP, [Det, N], prob=.5),
        ProbabilisticProduction(NP, [Det, AP, N], prob=.5),
        ProbabilisticProduction(AP, [Adj], prob=.5),
        ProbabilisticProduction(AP, [Adj, AP], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=1.),
        ProbabilisticProduction(Vt, ["saw"], prob=.5),
        ProbabilisticProduction(Vt, ["heard"], prob=.5),
        ProbabilisticProduction(Vi, ["ate"], prob=.5),
        ProbabilisticProduction(Vi, ["slept"], prob=.5),
        ProbabilisticProduction(N, ["cat"], prob=.5),
        ProbabilisticProduction(N, ["dog"], prob=.5),
        ProbabilisticProduction(Adj, ["big"], prob=.5),
        ProbabilisticProduction(Adj, ["old"], prob=.5)
    ]

    G = PCFG(S, R)  # grammar
    C = ""          # corpus

    ## every sentence that can be generated
    print("\n")
    for n, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (n, s, '.'))
    return G, C