def fill_missing_words(grammar: PCFG, missing_words: Set[str]) -> PCFG:
    """Return a grammar induced from ``grammar`` plus rules for ``missing_words``.

    A nonterminal ``UNK`` is introduced with one production per missing word
    (``UNK -> word``). Every left-hand side that owns at least one production
    with a string on its right-hand side additionally receives exactly one
    ``lhs -> UNK`` production. The result is built by ``induce_pcfg`` from the
    original productions followed by the added ones.

    Args:
        grammar: Grammar whose productions and start symbol are reused.
        missing_words: Words that should become reachable through ``UNK``.

    Returns:
        The grammar returned by ``induce_pcfg`` for the combined productions.
    """
    unknown = Nonterminal('UNK')

    # UNK -> word1 | word2 | ... | wordN
    unk_rules = [Production(unknown, [word]) for word in missing_words]

    productions = grammar.productions()

    # Left-hand sides that already received their `lhs -> UNK` rule. A set keeps
    # the membership test constant-time instead of scanning a growing list.
    seen_lhs: Set[Nonterminal] = set()
    rule: ProbabilisticProduction
    for rule in productions:
        lhs = rule.lhs()
        if lhs in seen_lhs:
            continue
        # Only rules with a string somewhere on the right-hand side qualify.
        if any(isinstance(element, str) for element in rule.rhs()):
            unk_rules.append(Production(lhs, [unknown]))
            seen_lhs.add(lhs)

    return induce_pcfg(grammar.start(), productions + unk_rules)
def _attaching(N, G, C, T):
    """Try to attach ``N`` as a new alternative of each OR symbol of ``G``.

    For every nonterminal of ``G`` whose symbol contains ``"OR"``, the bicluster
    of its AND-OR group (looked up in the module-level ``biclusters``) is
    extended with a row or a column for ``N``. When the original and extended
    bicluster / expression-context tables all pass ``_is_mc`` and the
    difference of their log posterior gains exceeds ``LPG_DIFF_THRESHOLD``, the
    rule ``O -> N`` is accepted: the productions of ``O`` are rebuilt, the
    group's bicluster is replaced and the corpus is reduced.

    Args:
        N: Nonterminal to attach.
        G: Current grammar; ``productions()`` and ``start()`` are used.
        C: Current corpus, handed to the counting and reduction helpers.
        T: Not read here; returned as given unless a rule is accepted, in which
            case it is recomputed with ``_create_t``.

    Returns:
        The tuple ``(G, C, T)`` after all accepted attachments.
    """
    print("attaching...")

    C_derived = _apply_grammar(G, C)
    n_symbol = N.symbol()

    # Distinct OR nonterminals of G, in order of first appearance.
    ORs = []
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)

    # NOTE(review): ORs and C_derived are computed once from the initial G and C
    # and are not refreshed after an attachment is accepted below - confirm
    # this is intended.
    for O in ORs:
        o_symbol = O.symbol()

        # Locate the AND-OR group that O belongs to.
        # NOTE(review): if g[1] / g[2] are plain strings this is a substring
        # test ("_OR_1" would also match "_OR_10") - verify against the code
        # that fills `biclusters`.
        group = None
        for g in biclusters:
            if o_symbol in g[1] or o_symbol in g[2]:
                group = g
                break
        if group is None:
            # No bicluster is known for this OR symbol, so there is nothing to
            # expand; previously this fell through to biclusters[None].
            continue

        # The number after the 4-character prefix (e.g. "_OR_2" -> 2) encodes
        # the side of the OR inside its group: odd = left, even = right.
        or_on_right = int(o_symbol[4:]) % 2 == 0

        # BC_t: the group's bicluster with counts taken from the derived corpus.
        BC_t = biclusters[group].copy()
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)

        # BC_t_1: BC_t extended with N (the proposed new rule O -> N).
        BC_t_1 = BC_t.copy()
        if or_on_right:
            # OR on the right: N becomes a new column.
            BC_t_1.loc[:, n_symbol] = [
                _count_occ(" ".join((x, n_symbol)), C) for x in BC_t.index
            ]
        else:
            # OR on the left: N becomes a new row.
            BC_t_1.loc[n_symbol, :] = [
                _count_occ(" ".join((n_symbol, x)), C) for x in BC_t.columns
            ]
        BC_t_1 = BC_t_1.astype(int)

        # EC_t: expression-context table of BC_t; EC_t_1 adds the rows for N.
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        EC_t_1 = EC_t.copy()
        if or_on_right:
            new_row_indices = [(row, n_symbol) for row in BC_t_1.index]
        else:
            new_row_indices = [(n_symbol, col) for col in BC_t_1.columns]

        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            # Create the row first, then fill it cell by cell.
            EC_t_1.loc[i_str, :] = [-1] * EC_t_1.shape[1]
            expression = " ".join(i)
            for j in EC_t_1.columns:
                # Context parts that represent integers are replaced by "".
                context = tuple(
                    "" if _represents_int(x) else x
                    for x in _ec_index_to_tuple(j, False)
                )
                EC_t_1.loc[i_str, j] = _count_occ(
                    " ".join([context[0], expression, context[1]]).strip(), C
                )
        EC_t_1 = EC_t_1.astype(int)

        # DataFrame.as_matrix() no longer exists in current pandas; .values
        # yields the same ndarray.
        bc_t = BC_t.values
        ec_t = EC_t.values
        bc_t_1 = BC_t_1.values
        ec_t_1 = EC_t_1.values

        # All four tables must be valid. The parentheses matter: without them
        # `not` applied to the first check only.
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue

        # Log posterior gain difference (Eq. 3).
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)

        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (o_symbol, n_symbol))

            total = np.sum(bc_t_1)

            # Keep every production except those of O itself. Comparing the
            # nonterminal (not a substring of its symbol) avoids also dropping
            # e.g. "_OR_10" when O is "_OR_1".
            rules = [prod for prod in G.productions() if prod.lhs() != O]

            # Rebuild the productions of O from the marginals of the extended
            # bicluster: rows when O is on the left, columns when on the right.
            if or_on_right:
                probs = np.sum(bc_t_1, 0) / total
                rhs_symbols = list(BC_t.columns) + [N]
            else:
                probs = np.sum(bc_t_1, 1) / total
                rhs_symbols = list(BC_t.index) + [N]
            rules.extend(
                ProbabilisticProduction(O, [rhs], prob=prob)
                for rhs, prob in zip(rhs_symbols, probs)
            )

            # Commit the attachment.
            biclusters[group] = BC_t_1.copy()  # updated AND-OR group
            G = PCFG(G.start(), rules)  # updated grammar
            C = _reduce_corpus(C, biclusters[group], N, True)  # reduced corpus
            T = _create_t(C)  # table rebuilt from the reduced corpus

    return G, C, T