def show_clusters_of_concepts(*dbs):
    '''
    Takes a sequence of databases and performs a multi-dimensional
    scaling on the synsets given by the merged set of 'concept' domains
    in them.
    '''
    domains = mergedom(*[db.domains for db in dbs])
    concepts = domains.get('concept', None)
    if concepts is None:
        logger.error('Domain "concept" not found in databases.')
        return
    if 'null' in concepts:  # remove the null concept
        del concepts[concepts.index('null')]
    synsets = [wn.synset(c) for c in concepts]  # @UndefinedVariable
    distance = zeros((len(synsets), len(synsets)))
    for i, pointi in enumerate(synsets):
        sys.stdout.write('{:d} / {:d}\r'.format(i, len(synsets)))
        for j, pointj in enumerate(synsets):
            sim = pointi.path_similarity(pointj)
            if sim is None:
                sim = 0
            distance[i, j] = 1. - sim
    Y, eig = doMDS(distance, dimensions=2)
    pylab.figure(1)
    for i, s in enumerate(synsets):
        pylab.text(Y[i, 0], Y[i, 1], s.name, fontsize=8)
    pylab.plot(Y[:, 0], Y[:, 1], '.')
    pylab.show()
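# NOTE: doMDS is defined elsewhere. For orientation only, a minimal
# sketch of a classical (Torgerson) MDS with the same signature,
# assuming it embeds the distance matrix along the top eigenvectors of
# the double-centered Gram matrix:
import numpy
from numpy.linalg import eigh

def doMDS_sketch(distance, dimensions=2):
    n = distance.shape[0]
    D2 = distance ** 2
    J = numpy.eye(n) - numpy.ones((n, n)) / n  # centering matrix
    B = -.5 * J.dot(D2).dot(J)                 # Gram matrix of the embedding
    eigvals, eigvecs = eigh(B)                 # eigenvalues in ascending order
    idx = numpy.argsort(eigvals)[::-1][:dimensions]
    scale = numpy.sqrt(numpy.maximum(eigvals[idx], 0))
    Y = eigvecs[:, idx] * scale                # n x dimensions coordinates
    return Y, eigvals[idx]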
def materialize(self, dbs):
    '''
    For each noisy domain, (1) if there is a static domain specification,
    map the values of that domain in all dbs to their closest neighbor in
    the domain. (2) If there is no static domain declaration, apply SAHN
    clustering to the values appearing in the dbs, take the cluster
    centroids as the values of the domain and map the dbs as in (1).
    '''
    fulldomains = mergedom(*[db.domains for db in dbs])
    for domain in self.domains:
        if fulldomains.get(domain, None) is None:
            continue
        # apply the clustering step
        values = fulldomains[domain]
        clusters = SAHN(values)
        self.clusters[domain] = clusters
        self.noisy_domains[domain] = [c._computeCentroid()[0] for c in clusters]
        if self.verbose:
            self.log.info('  reducing domain %s: %d -> %d values' %
                          (domain, len(values), len(clusters)))
            self.log.info('  %s', str(self.noisy_domains[domain]))
    return self.transform_dbs(dbs)
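# NOTE: SAHN (sequential agglomerative hierarchical non-overlapping
# clustering) is imported from elsewhere. As a hedged stand-in only:
# average-linkage clustering over normalized edit distances via SciPy.
# The metric, linkage method and cut-off threshold are all assumptions.
import numpy
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

def sahn_sketch(values, threshold=0.4):
    def levenshtein(a, b):
        # standard dynamic-programming edit distance
        m, n = len(a), len(b)
        d = numpy.zeros((m + 1, n + 1), dtype=int)
        d[:, 0] = numpy.arange(m + 1)
        d[0, :] = numpy.arange(n + 1)
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1,
                              d[i - 1, j - 1] + (a[i - 1] != b[j - 1]))
        return d[m, n]
    n = len(values)
    dist = numpy.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            dist[i, j] = dist[j, i] = (levenshtein(values[i], values[j])
                                       / float(max(len(values[i]), len(values[j]))))
    # cut the dendrogram at a fixed distance threshold
    labels = fcluster(linkage(squareform(dist), method='average'),
                      threshold, criterion='distance')
    clusters = {}
    for value, label in zip(values, labels):
        clusters.setdefault(label, []).append(value)
    return list(clusters.values())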
def merge_all_domains(pracinference):
    all_dbs = []
    for step in pracinference.inference_steps:
        all_dbs.extend(step.input_dbs)
        all_dbs.extend(step.output_dbs)
    fullDomain = mergedom(*[db.domains for db in all_dbs])
    return fullDomain
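# NOTE: mergedom itself is not shown in this module. Judging from its
# call sites, it unions several {domain_name: [values]} dictionaries;
# a hypothetical illustration of that presumed behavior:
def mergedom_sketch(*domains):
    full = {}
    for dom in domains:
        for name, values in dom.items():
            merged = full.setdefault(name, [])
            for v in values:
                if v not in merged:  # keep each value exactly once
                    merged.append(v)
    return full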
def get_similarities(self, *dbs):
    '''
    Generates, for each input database, a database containing all
    possible similarities for its senses.

    :param dbs:     the input databases to be queried
    :return:        a generator yielding an instance of Database
    '''
    wordnet = self.prac.wordnet
    full_domain = mergedom(*[db.domains for db in dbs])
    for db in dbs:
        db_ = Database(self.mln)
        for q in db.query('has_sense(?w, ?s) ^ is_a(?s, ?c)'):
            sense = q['?s']
            concept = q['?c']
            for c in full_domain['concept']:
                sim = wordnet.similarity(c, concept)
                db_ << ('is_a({},{})'.format(sense, c), sim)
        yield db_
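# Hypothetical usage (assumes `prac` is a PRAC instance, `dbs` are loaded
# databases, and that this method lives on the 'wn_senses' module handle,
# in line with the prac.module('wn_senses') calls below):
#
#     wnmod = prac.module('wn_senses')
#     for simdb in wnmod.get_similarities(*dbs):
#         simdb.write()  # inspect the weighted is_a evidence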
def addFuzzyEvidenceToDBs(self, *dbs):
    '''
    Adds to the databases dbs all fuzzy 'is_a' relationships for all
    senses contained in the DB and in the MLN.
    (has side effects on the original databases)
    '''
    mln_domains = dbs[0].domains
    domains_full = mergedom(mln_domains, *[db.domains for db in dbs])
    concepts = domains_full['concept']
    wordnet = WordNet()
    for db in dbs:
        for res in db.query('is_a(?sense, ?concept)'):
            sense = res['?sense']
            concept = res['?concept']
            for c in concepts:
                similarity = wordnet.similarity(concept, c)
                logger.info('{} ~ {} = {:.2f}'.format(concept, c, similarity))
                db << ('is_a({},{})'.format(sense, c), similarity)
    return dbs
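# For intuition: the fuzzy truth values are WordNet similarities in [0, 1].
# NLTK's path similarity, which PRAC's WordNet wrapper presumably builds
# on (an assumption), behaves like this:
#
#     from nltk.corpus import wordnet as wn
#     dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
#     print dog.path_similarity(dog)  # 1.0 - identical synsets
#     print dog.path_similarity(cat)  # 0.2 - decays with taxonomy distance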
def __call__(self, node, **params):
    # ======================================================================
    # Initialization
    # ======================================================================
    logger.debug('inference on {}'.format(self.name))

    if self.prac.verbose > 0:
        print prac_heading('Resolving Coreferences')

    preds = list(node.rdfs(goaltest=lambda n: isinstance(n, FrameNode)
                                              and not n.children, all=True))[:2]
    dbs = node.outdbs
    infstep = PRACInferenceStep(node, self)
    projectpath = os.path.join(pracloc.pracmodules, self.name)
    ac = None
    pngs = {}
    # if not preds: return []

    # ======================================================================
    # Preprocessing
    # ======================================================================

    # merge output dbs from senses_and_roles step, containing
    # roles inferred from multiple sentences.
    if not preds:
        # no coreferencing required - forward dbs and settings
        # from previous module
        infstep.indbs = [db.copy() for db in dbs]
        infstep.outdbs = [db.copy() for db in infstep.indbs]
        logger.debug('%s has no predecessors. Nothing to do here. '
                     'Passing db...' % node)
        return [node]

    # retrieve all words from the dbs to calculate distances.
    # Do not use pracinference.instructions as they are not
    # annotated by the Stanford parser.
    sentences = [db.words() for pred in preds for db in pred.indbs]
    infstep.indbs = [db.copy() for db in dbs]
    # infstep.outdbs = [db.copy() for db in infstep.indbs]

    # query action core to load corresponding project
    actioncore = node.frame.actioncore

    # clear corefdb and unify current db with the two preceding ones
    corefdb = PRACDatabase(self.prac)
    corefdb = corefdb.union(dbs, self.prac.mln)
    # for s in range(max(0, i - 2), i+1):
    #     corefdb = corefdb.union(dbs[s], self.prac.mln)
    for pred in preds:
        logger.debug('unifying with %s' % pred)
        for db in pred.indbs:
            corefdb = corefdb.union(db, self.prac.mln)

    # remove all senses from the databases' domain that are not
    # assigned to any word.
    for q in corefdb.query('!(EXIST ?w (has_sense(?w,?sense)))'):
        corefdb.rmval('sense', q['?sense'])

    try:
        # preprocessing: adding distance information for each
        # word in the instructions
        # s = words[max(0, i - 2):i+1]
        # snts = list(enumerate(s))
        # idx = len(snts) - 1  # idx of current sentence
        # for s in snts[:-1]:
        #     idx2 = s[0]
        #     for w in s[1]:
        #         corefdb << 'distance({},DIST{})'.format(w, idx - idx2)
        for sidx, s in enumerate(sentences):
            for w in s:
                cont = True
                for q in corefdb.query('distance({}, ?w)'.format(w)):
                    cont = False
                    break
                if not cont:
                    continue
                corefdb << 'distance({},DIST{})'.format(w, sidx)
                # print 'distance({},DIST{})'.format(w, sidx)

        logger.debug('loading Project: {}'.format(
            colorize(actioncore, (None, 'cyan', True), True)))
        project = MLNProject.open(
            os.path.join(projectpath, '{}.pracmln'.format(actioncore)))
        mlntext = project.mlns.get(project.queryconf['mln'], None)
        mln = parse_mln(mlntext,
                        searchpaths=[self.module_path],
                        projectpath=projectpath,
                        logic=project.queryconf.get('logic', 'FuzzyLogic'),
                        grammar=project.queryconf.get('grammar', 'PRACGrammar'))
    except MLNParsingError:
        logger.warning('Could not use MLN in project {} for coreference resolution'
                       .format(colorize(actioncore, (None, 'cyan', True), True)))
        infstep.outdbs = [db.copy(self.prac.mln) for db in dbs]
        infstep.png = node.parent.laststep.png
        infstep.applied_settings = node.parent.laststep.applied_settings
        return [node]
    except Exception:
        infstep.outdbs = [db.copy(self.prac.mln) for db in dbs]
        infstep.png = node.parent.laststep.png
        infstep.applied_settings = node.parent.laststep.applied_settings
        logger.warning('Could not load project "{}". Passing dbs to next '
                       'module...'.format(actioncore))
        return [node]

    # adding similarities
    wnmod = self.prac.module('wn_senses')
    newdatabase = wnmod.add_sims(corefdb, mln)

    # update queries depending on missing roles
    acroles = filter(lambda role: role != 'action_verb',
                     self.prac.actioncores[actioncore].roles)
    missingroles = [ar for ar in acroles
                    if len(list(newdatabase.query('{}(?w,{})'.format(ar, actioncore)))) == 0]
    conf = project.queryconf
    conf.update({'queries': ','.join(missingroles)})
    print colorize('querying for missing roles {}'.format(conf['queries']),
                   (None, 'green', True), True)

    # asserting impossible role-ac combinations, leaving previously
    # inferred roles untouched
    fulldom = mergedom(mln.domains, newdatabase.domains)
    ac_domains = [dom for dom in fulldom if '_ac' in dom]
    acs = list(set([v for a in ac_domains for v in fulldom[a]]))
    acs = filter(lambda ac_: ac_ != actioncore, acs)

    for ac1 in acs:
        for r in missingroles:
            for w in newdatabase.domains['word']:
                # words with no sense are asserted false
                if list(corefdb.query('!(EXIST ?sense (has_sense({},?sense)))'.format(w))):
                    newdatabase << '!{}({},{})'.format(r, w, actioncore)
                # leave previously inferred information roles untouched
                if list(newdatabase.query('{}({},{})'.format(r, w, ac1))):
                    continue
                else:
                    newdatabase << '!{}({},{})'.format(r, w, ac1)

    try:
        # ==========================================================
        # Inference
        # ==========================================================
        infer = self.mlnquery(config=conf,
                              verbose=self.prac.verbose > 2,
                              db=newdatabase,
                              mln=mln)
        if self.prac.verbose == 2:
            print
            print prac_heading('INFERENCE RESULTS')
            infer.write()

        # ==========================================================
        # Postprocessing
        # ==========================================================
        # merge initial db with results
        for db in infstep.indbs:
            resultdb = db.copy()
            for res in infer.results.keys():
                if infer.results[res] != 1.0:
                    continue
                resultdb << str(res)
                _, _, args = self.prac.mln.logic.parse_literal(res)
                w = args[0]
                for q in newdatabase.query('has_sense({0},?s) ^ has_pos({0},?pos)'.format(w)):
                    resultdb << 'has_sense({},{})'.format(w, q['?s'])
                    resultdb << 'is_a({0},{0})'.format(q['?s'])
                    resultdb << 'has_pos({},{})'.format(w, q['?pos'])
            resultdb = wnmod.add_sims(resultdb, mln)
            # enhance the frame data
            for mrole in missingroles:
                for q in resultdb.query('{role}(?w, {actioncore}) ^ has_sense(?w, ?s)'
                                        .format(role=mrole, actioncore=actioncore)):
                    for p in preds:
                        if p.frame.object(q['?w']) is not None:
                            node.frame.actionroles[mrole] = p.frame.object(q['?w'])
                            break
            infstep.outdbs.append(resultdb)
        pprint(node.frame.tojson())
    except NoConstraintsError:
        logger.debug('No coreferences found. Passing db...')
        infstep.outdbs.append(db)
    except Exception:
        logger.error('Something went wrong')
        traceback.print_exc()

    pngs['Coref - ' + str(node)] = get_cond_prob_png(
        project.queryconf.get('queries', ''), dbs, filename=self.name)
    infstep.png = pngs
    infstep.applied_settings = project.queryconf.config
    return [node]
def materialize(self, *dbs):
    '''
    Materializes this MLN with respect to the databases given. This must
    be called before learning or inference can take place.

    Returns a new MLN instance containing expanded formula templates and
    materialized weights. Normally, this method should not be called from the
    outside. Also takes into account whether or not particular domain values
    or predicates are actually used in the data, i.e. if a predicate is not
    used in any of the databases, all formulas that make use of this predicate
    are ignored.

    :param dbs:     list of :class:`database.Database` objects for materialization.
    '''
    logger.debug("materializing formula templates...")

    # obtain full domain with all objects
    fulldomain = mergedom(self.domains, *[db.domains for db in dbs])
    logger.debug('full domains: %s' % fulldomain)
    mln_ = self.copy()

    # collect the admissible formula templates. templates might not be
    # admissible since the domain of a template variable might be empty.
    for ft in list(mln_.formulas):
        domnames = ft.vardoms().values()
        if any([domname not in fulldomain for domname in domnames]):
            logger.debug('Discarding formula template %s, since it cannot be grounded (domain(s) %s empty).' %
                         (fstr(ft), ','.join([d for d in domnames if d not in fulldomain])))
            mln_.rmf(ft)

    # collect the admissible predicates. a predicate may become inadmissible
    # if either the domain of one of its arguments is empty or there is
    # no formula containing the respective predicate.
    predicates_used = set()
    for _, f in mln_.iterformulas():
        predicates_used.update(f.prednames())
    for predicate in self.iterpreds():
        remove = False
        if any([dom not in fulldomain for dom in predicate.argdoms]):
            logger.debug('Discarding predicate %s, since it cannot be grounded.' % predicate.name)
            remove = True
        if predicate.name not in predicates_used:
            logger.debug('Discarding predicate %s, since it is unused.' % predicate.name)
            remove = True
        if remove:
            del mln_._predicates[predicate.name]

    # permanently transfer domains of variables that were expanded from templates
    for _, ft in mln_.iterformulas():
        domnames = ft.template_variables().values()
        for domname in domnames:
            mln_.domains[domname] = fulldomain[domname]

    # materialize the formula templates
    mln__ = mln_.copy()
    mln__._rmformulas()
    for i, template in mln_.iterformulas():
        for variant in template.template_variants():
            idx = len(mln__._formulas)
            f = mln__.formula(variant,
                              weight=template.weight,
                              fixweight=mln_.fixweights[i])
            f.idx = idx
    mln__._materialized = True
    return mln__
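# Hypothetical usage of the materialization step (the file name is a
# placeholder; assumes `mln` is an instance of this class and uses
# Database.load as elsewhere in this module):
#
#     dbs = list(Database.load(mln, 'training.db'))
#     materialized = mln.materialize(*dbs)  # prune empty domains/unused
#                                           # predicates, expand templates
#     # `materialized` is the instance learning/inference should use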
if __name__ == '__main__':
    prac = PRAC()
    dbs = list(Database.load(prac.mln, os.path.join('/', 'home', 'nyga', 'work',
                                                    'nl_corpora', 'wikihow', 'Slicing.db')))
    domains = mergedom(*[d.domains for d in dbs])
    concepts = domains.get('concept', None)