Exemplo n.º 1
0
def show_clusters_of_concepts(*dbs):
    '''
    Plots a two-dimensional multi-dimensional scaling of the synsets given
    by the merged set of 'concept' domains of a sequence of databases.

    The distance between two synsets is ``1 - path_similarity``; pairs for
    which ``path_similarity`` returns None are treated as similarity 0.
    The 'null' concept, if present, is removed before the synsets are
    looked up.

    :param dbs: the databases whose ``domains`` are merged.
    :return:    None. If the merged domains contain no 'concept' domain,
                an error is logged and nothing is plotted.
    '''
    domains = mergedom(*[db.domains for db in dbs])
    concepts = domains.get('concept')
    if concepts is None:
        logger.error('Domain "concepts" not found in databases.')
        return
    if 'null' in concepts:  # remove the null concept
        del concepts[concepts.index('null')]
    # a real list is required here: the synsets are counted and iterated
    # several times, which a lazy map object does not support
    synsets = [wn.synset(c) for c in concepts]  # @UndefinedVariable
    total = len(synsets)
    distance = zeros((total, total))
    for i, syn_i in enumerate(synsets):
        # the progress indicator only depends on the row index, so it is
        # written once per row instead of once per cell
        sys.stdout.write('{:f} / {:f}      \r'.format(i, total))
        for j, syn_j in enumerate(synsets):
            sim = syn_i.path_similarity(syn_j)
            if sim is None:
                sim = 0
            distance[i, j] = 1. - sim
    Y, _ = doMDS(distance, dimensions=2)
    pylab.figure(1)
    # label every projected point with the name of its synset
    for i, s in enumerate(synsets):
        text(Y[i, 0], Y[i, 1], s.name, fontsize=8)
    pylab.plot(Y[:, 0], Y[:, 1], '.')
    pylab.show()
Exemplo n.º 2
0
 def materialize(self, dbs):
     '''
     Clusters the values of the noisy domains and transforms the dbs.

     The domains of all dbs are merged first. Every domain listed in
     ``self.domains`` that occurs in the merged domains is clustered
     with SAHN: the clusters are stored in ``self.clusters`` and the
     first component of each cluster centroid is stored as a value of
     that domain in ``self.noisy_domains``. Domains that do not occur
     in the merged domains are skipped.

     :param dbs: the databases to be processed.
     :return:    the result of ``self.transform_dbs(dbs)``.
     '''
     merged = mergedom(*[db.domains for db in dbs])
     for name in self.domains:
         values = merged.get(name)
         if values is None:
             # the domain does not appear in any of the dbs
             continue
         # apply the clustering step
         clusters = SAHN(values)
         centroids = []
         for cluster in clusters:
             centroids.append(cluster._computeCentroid()[0])
         self.clusters[name] = clusters
         self.noisy_domains[name] = centroids
         if not self.verbose:
             continue
         self.log.info('  reducing domain %s: %d -> %d values' %
                       (name, len(values), len(clusters)))
         self.log.info('   %s', str(self.noisy_domains[name]))
     return self.transform_dbs(dbs)
Exemplo n.º 3
0
 def merge_all_domains(pracinference):
     '''
     Merges the domains of all input and output databases of all
     inference steps of the given inference object.

     :param pracinference: object providing ``inference_steps``, each of
                           which has ``input_dbs`` and ``output_dbs``.
     :return:              the result of ``mergedom`` over all domains.
     '''
     collected = []
     for step in pracinference.inference_steps:
         # inputs of a step come before its outputs
         collected += list(step.input_dbs)
         collected += list(step.output_dbs)
     return mergedom(*[db.domains for db in collected])
Exemplo n.º 4
0
    def get_similarities(self, *dbs):
        '''
        Generates, for each input database, a new database holding the
        similarities of its senses to all concepts of the merged domains.

        For every result of the query ``has_sense(?w, ?s) ^ is_a(?s, ?c)``
        and every value ``c`` of the merged 'concept' domain, the atom
        ``is_a(<sense>,<c>)`` is added to the new database together with
        the value ``wordnet.similarity(c, <concept>)``.

        :param dbs: the input databases to be queried
        :return:    a generator yielding one Database per input database
        '''
        wordnet = self.prac.wordnet
        merged = mergedom(*[db.domains for db in dbs])
        for source in dbs:
            simdb = Database(self.mln)
            matches = source.query('has_sense(?w, ?s) ^ is_a(?s, ?c)')
            for match in matches:
                sense, concept = match['?s'], match['?c']
                for candidate in merged['concept']:
                    value = wordnet.similarity(candidate, concept)
                    atom = 'is_a({},{})'.format(sense, candidate)
                    simdb << (atom, value)
            yield simdb
Exemplo n.º 5
0
 def addFuzzyEvidenceToDBs(self, *dbs):
     '''
     Adds fuzzy 'is_a' evidence to each of the given databases in place.

     For every result of the query ``is_a(?sense, ?concept)`` on a
     database and every value ``c`` of the merged 'concept' domain, the
     atom ``is_a(<sense>,<c>)`` is added to that database together with
     the value ``wordnet.semilarity(<concept>, c)``. Each value is also
     logged at info level.

     :param dbs: the databases to extend (they are modified).
     :return:    the same tuple of databases that was passed in.
     '''
     # the domains of the first db take the first position in the merge
     domain_list = [dbs[0].domains]
     domain_list.extend([db.domains for db in dbs])
     concepts = mergedom(*domain_list)['concept']
     wordnet = WordNet()
     for db in dbs:
         for match in db.query('is_a(?sense, ?concept)'):
             sense, concept = match['?sense'], match['?concept']
             for candidate in concepts:
                 # NOTE(review): 'semilarity' is the method name used by
                 # the original code - confirm against the WordNet class
                 value = wordnet.semilarity(concept, candidate)
                 message = '{} ~ {} = {:.2f}'.format(
                     concept, candidate, value)
                 logger.info(message)
                 db << ('is_a({},{})'.format(sense, candidate), value)
     return dbs
Exemplo n.º 6
0
    def __call__(self, node, **params):

        # ======================================================================
        # Initialization
        # ======================================================================

        logger.debug('inference on {}'.format(self.name))

        if self.prac.verbose > 0:
            print prac_heading('Resolving Coreferences')

        preds = list(
            node.rdfs(
                goaltest=lambda n: isinstance(n, FrameNode) and not n.children,
                all=True))[:2]
        dbs = node.outdbs
        infstep = PRACInferenceStep(node, self)
        projectpath = os.path.join(pracloc.pracmodules, self.name)
        ac = None
        pngs = {}

        #         if not preds: return []
        # ======================================================================
        # Preprocessing
        # ======================================================================

        # merge output dbs from senses_and_roles step, containing
        # roles inferred from multiple sentences.
        if not preds:
            # no coreferencing required - forward dbs and settings
            # from previous module
            infstep.indbs = [db.copy() for db in dbs]
            infstep.outdbs = [db.copy() for db in infstep.indbs]
            logger.debug(
                '%s has no predecessors. Nothing to do here. Passing db...' %
                node)
            return [node]

        # retrieve all words from the dbs to calculate distances.
        # Do not use pracinference.instructions as they are not
        # annotated by the Stanford parser.
        sentences = [db.words() for pred in preds for db in pred.indbs]
        infstep.indbs = [db.copy() for db in dbs]
        #         infstep.outdbs = [db.copy() for db in infstep.indbs]
        # query action core to load corresponding project

        actioncore = node.frame.actioncore
        # clear corefdb and unify current db with the two preceding ones
        corefdb = PRACDatabase(self.prac)
        corefdb = corefdb.union(dbs, self.prac.mln)
        #         for s in range(max(0, i - 2), i+1):
        #             corefdb = corefdb.union(dbs[s], self.prac.mln)
        for pred in preds:
            logger.debug('unifying with %s' % pred)
            for db in pred.indbs:
                corefdb = corefdb.union(db, self.prac.mln)

        # remove all senses from the databases' domain that are not
        # assigned to any word.
        for q in corefdb.query('!(EXIST ?w (has_sense(?w,?sense)))'):
            corefdb.rmval('sense', q['?sense'])
        try:
            # preprocessing: adding distance information for each
            # word in the instructions
            #             s = words[max(0, i - 2):i+1]
            #             snts = list(enumerate(s))
            #             idx = len(snts) - 1  # idx of current sentence
            #             for s in snts[:-1]:
            #                 idx2 = s[0]
            #                 for w in s[1]:
            #                     corefdb << 'distance({},DIST{})'.format(w, idx - idx2)
            for sidx, s in enumerate(sentences):
                for w in s:
                    cont = True
                    for q in corefdb.query('distance({}, ?w)'.format(w)):
                        cont = False
                        break
                    if not cont: continue
                    corefdb << 'distance({},DIST{})'.format(w, sidx)
#                     print 'distance({},DIST{})'.format(w, sidx)

            logger.debug('loading Project: {}'.format(
                colorize(actioncore, (None, 'cyan', True), True)))
            project = MLNProject.open(
                os.path.join(projectpath, '{}.pracmln'.format(actioncore)))
            mlntext = project.mlns.get(project.queryconf['mln'], None)
            mln = parse_mln(mlntext,
                            searchpaths=[self.module_path],
                            projectpath=projectpath,
                            logic=project.queryconf.get('logic', 'FuzzyLogic'),
                            grammar=project.queryconf.get(
                                'grammar', 'PRACGrammar'))
        except MLNParsingError:
            logger.warning(
                'Could not use MLN in project {} for coreference resolution'.
                format(colorize(actioncore, (None, 'cyan', True), True)))
            infstep.outdbs = [db.copy(self.prac.mln) for db in dbs]
            infstep.png = node.parent.laststep.png
            infstep.applied_settings = node.parent.laststep.applied_settings
            return [node]
        except Exception:
            infstep.outdbs = [db.copy(self.prac.mln) for db in dbs]
            infstep.png = node.parent.laststep.png
            infstep.applied_settings = node.parent.laststep.applied_settings
            logger.warning(
                'Could not load project "{}". Passing dbs to next module...'.
                format(ac))
            return [node]

        # adding similarities
        wnmod = self.prac.module('wn_senses')
        newdatabase = wnmod.add_sims(corefdb, mln)

        # update queries depending on missing roles
        acroles = filter(lambda role: role != 'action_verb',
                         self.prac.actioncores[actioncore].roles)
        missingroles = [
            ar for ar in acroles
            if len(list(newdatabase.query('{}(?w,{})'.format(ar, actioncore))))
            == 0
        ]
        conf = project.queryconf
        conf.update({'queries': ','.join(missingroles)})
        print colorize('querying for missing roles {}'.format(conf['queries']),
                       (None, 'green', True), True)

        # asserting impossible role-ac combinations, leaving previously
        # inferred roles untouched
        fulldom = mergedom(mln.domains, newdatabase.domains)
        ac_domains = [dom for dom in fulldom if '_ac' in dom]
        acs = list(set([v for a in ac_domains for v in fulldom[a]]))
        acs = filter(lambda ac_: ac_ != actioncore, acs)

        for ac1 in acs:
            for r in missingroles:
                for w in newdatabase.domains['word']:
                    # words with no sense are asserted false
                    if list(
                            corefdb.query(
                                '!(EXIST ?sense (has_sense({},?sense)))'.
                                format(w))):
                        newdatabase << '!{}({},{})'.format(r, w, actioncore)
                    # leave previously inferred information roles
                    # untouched
                    if list(newdatabase.query('{}({},{})'.format(r, w, ac1))):
                        continue
                    else:
                        newdatabase << '!{}({},{})'.format(r, w, ac1)
        try:
            # ==========================================================
            # Inference
            # ==========================================================
            infer = self.mlnquery(config=conf,
                                  verbose=self.prac.verbose > 2,
                                  db=newdatabase,
                                  mln=mln)
            if self.prac.verbose == 2:
                print
                print prac_heading('INFERENCE RESULTS')
                infer.write()
            # ==========================================================
            # Postprocessing
            # ==========================================================
            # merge initial db with results
            for db in infstep.indbs:
                resultdb = db.copy()
                for res in infer.results.keys():
                    if infer.results[res] != 1.0:
                        continue
                    resultdb << str(res)
                    _, _, args = self.prac.mln.logic.parse_literal(res)
                    w = args[0]
                    for q in newdatabase.query(
                            'has_sense({0},?s) ^ has_pos({0},?pos)'.format(w)):
                        resultdb << 'has_sense({},{})'.format(w, q['?s'])
                        resultdb << 'is_a({0},{0})'.format(q['?s'])
                        resultdb << 'has_pos({},{})'.format(w, q['?pos'])
                resultdb = wnmod.add_sims(resultdb, mln)
                # enhance the frame data
                for mrole in missingroles:
                    for q in resultdb.query(
                            '{role}(?w, {actioncore}) ^ has_sense(?w, ?s)'.
                            format(role=mrole, actioncore=actioncore)):
                        for p in preds:
                            if p.frame.object(q['?w']) is not None:
                                node.frame.actionroles[mrole] = p.frame.object(
                                    q['?w'])
                                break
                infstep.outdbs.append(resultdb)
            pprint(node.frame.tojson())
        except NoConstraintsError:
            logger.debug('No coreferences found. Passing db...')
            infstep.outdbs.append(db)
        except Exception:
            logger.error('Something went wrong')
            traceback.print_exc()

        pngs['Coref - ' + str(node)] = get_cond_prob_png(project.queryconf.get(
            'queries', ''),
                                                         dbs,
                                                         filename=self.name)
        infstep.png = pngs
        infstep.applied_settings = project.queryconf.config
        return [node]
Exemplo n.º 7
0
    def materialize(self, *dbs):
        '''
        Materializes this MLN with respect to the databases given. This must
        be called before learning or inference can take place.

        Returns a new MLN instance containing expanded formula templates and
        materialized weights. Normally, this method should not be called
        from the outside. Formula templates that refer to a domain missing
        from the merged domains of this MLN and the databases are discarded.
        Predicates are discarded if one of their argument domains is missing
        or if no remaining formula uses them.

        :param dbs:     list of :class:`database.Database` objects for materialization.
        :return:        the materialized MLN, built from copies of this one.
        '''
        logger.debug("materializing formula templates...")

        # obtain full domain with all objects
        fulldomain = mergedom(self.domains, *[db.domains for db in dbs])
        logger.debug('full domains: %s' % fulldomain)
        mln_ = self.copy()
        # collect the admissible formula templates. templates might be not
        # admissible since the domain of a template variable might be empty.
        for ft in list(mln_.formulas):
            missing = [d for d in ft.vardoms().values() if d not in fulldomain]
            if missing:
                logger.debug('Discarding formula template %s, since it cannot be grounded (domain(s) %s empty).' % \
                    (fstr(ft), ','.join(missing)))
                mln_.rmf(ft)
        # collect the admissible predicates. a predicate may become inadmissible
        # if either the domain of one of its arguments is empty or there is
        # no formula containing the respective predicate.
        predicates_used = set()
        for _, f in mln_.iterformulas():
            predicates_used.update(f.prednames())
        for predicate in self.iterpreds():
            remove = False
            if any(dom not in fulldomain for dom in predicate.argdoms):
                logger.debug(
                    'Discarding predicate %s, since it cannot be grounded.' %
                    (predicate.name))
                remove = True
            if predicate.name not in predicates_used:
                logger.debug('Discarding predicate %s, since it is unused.' %
                             predicate.name)
                remove = True
            if remove:
                del mln_._predicates[predicate.name]
        # permanently transfer domains of variables that were expanded from templates
        for _, ft in mln_.iterformulas():
            for domname in ft.template_variables().values():
                mln_.domains[domname] = fulldomain[domname]
        # materialize the formula templates
        mln__ = mln_.copy()
        mln__._rmformulas()
        for i, template in mln_.iterformulas():
            for variant in template.template_variants():
                # the new formula gets the next free position as its index
                idx = len(mln__._formulas)
                # the template weight is passed on unchanged, whatever
                # its type is
                f = mln__.formula(variant,
                                  weight=template.weight,
                                  fixweight=mln_.fixweights[i])
                f.idx = idx
        mln__._materialized = True
        return mln__
Exemplo n.º 8
0
    Takes a sequence of databases and performs a multi-dimensional scaling on
    the synsets given by the merged set of 'concept' domains in them.
    '''
    domains = mergedom(*map(lambda d: d.domains, dbs))
    concepts = domains.get('concept', None)
    if concepts is None:
        logger.error('Domain "concepts" not found in databases.')
        return
    if 'null' in concepts: # remove the null concept
        del concepts[concepts.index('null')]
    synsets = map(lambda x: wn.synset(x), concepts)  # @UndefinedVariable
    distance = zeros((len(synsets),len(synsets)))
    for (i, pointi) in enumerate(synsets):
        for (j, pointj) in enumerate(synsets):
            sys.stdout.write('{:f} / {:f}      \r'.format(i, len(synsets)))
            sim = synsets[i].path_similarity(synsets[j])
            if sim is None: sim = 0
            distance[i,j] = 1. - sim
    Y, eig = doMDS(distance, dimensions=2)
    pylab.figure(1)
    for i, s in enumerate(synsets):
        text(Y[i,0],Y[i,1], s.name, fontsize=8)
    pylab.plot(Y[:,0],Y[:,1],'.')
    pylab.show()
    
if __name__ == '__main__':
    # script entry: load the databases from 'Slicing.db' and look up the
    # 'concept' domain in their merged domains
    prac = PRAC()
    dbs = list(
        Database.load(
            prac.mln,
            os.path.join('/', 'home', 'nyga', 'work', 'nl_corpora',
                         'wikihow', 'Slicing.db')))
    domains = mergedom(*(db.domains for db in dbs))
    concepts = domains.get('concept')
Exemplo n.º 9
0
    def materialize(self, *dbs):
        '''
        Materializes this MLN with respect to the databases given. This must
        be called before learning or inference can take place.

        Returns a new MLN instance containing expanded formula templates and
        materialized weights. Normally, this method should not be called
        from the outside. Formula templates that refer to a domain missing
        from the merged domains of this MLN and the databases are discarded.
        Predicates are discarded if one of their argument domains is missing
        or if no remaining formula uses them.

        :param dbs:     list of :class:`database.Database` objects for materialization.
        :return:        the materialized MLN, built from copies of this one.
        '''
        logger.debug("materializing formula templates...")

        # obtain full domain with all objects
        fulldomain = mergedom(self.domains, *[db.domains for db in dbs])
        logger.debug('full domains: %s' % fulldomain)

        mln_ = self.copy()

        # collect the admissible formula templates. templates might be not
        # admissible since the domain of a template variable might be empty.
        for ft in list(mln_.formulas):
            missing = [d for d in ft.vardoms().values() if d not in fulldomain]
            if missing:
                logger.debug('Discarding formula template %s, since it cannot be grounded (domain(s) %s empty).' % \
                    (fstr(ft), ','.join(missing)))
                mln_.rmf(ft)
        # collect the admissible predicates. a predicate may become inadmissible
        # if either the domain of one of its arguments is empty or there is
        # no formula containing the respective predicate.
        predicates_used = set()
        for _, f in mln_.iterformulas():
            predicates_used.update(f.prednames())
        for predicate in self.iterpreds():
            remove = False
            if any(dom not in fulldomain for dom in predicate.argdoms):
                logger.debug('Discarding predicate %s, since it cannot be grounded.' % (predicate.name))
                remove = True
            if predicate.name not in predicates_used:
                logger.debug('Discarding predicate %s, since it is unused.' % predicate.name)
                remove = True
            if remove:
                del mln_._predicates[predicate.name]

        # permanently transfer domains of variables that were expanded from templates
        for _, ft in mln_.iterformulas():
            for domname in ft.template_variables().values():
                mln_.domains[domname] = fulldomain[domname]

        # materialize the formula templates
        mln__ = mln_.copy()
        mln__._rmformulas()
        for i, template in mln_.iterformulas():
            for variant in template.template_variants():
                # the new formula gets the next free position as its index
                idx = len(mln__._formulas)
                # the template weight is passed on unchanged, whatever
                # its type is
                f = mln__.formula(variant, weight=template.weight,
                                  fixweight=mln_.fixweights[i])
                f.idx = idx
        mln__._materialized = True
        return mln__
Exemplo n.º 10
0
    Takes a sequence of databases and performs a multi-dimensional scaling on
    the synsets given by the merged set of 'concept' domains in them.
    '''
    domains = mergedom(*[d.domains for d in dbs])
    concepts = domains.get('concept', None)
    if concepts is None:
        logger.error('Domain "concepts" not found in databases.')
        return
    if 'null' in concepts: # remove the null concept
        del concepts[concepts.index('null')]
    synsets = [wn.synset(x) for x in concepts]  # @UndefinedVariable
    distance = zeros((len(synsets),len(synsets)))
    for (i, pointi) in enumerate(synsets):
        for (j, pointj) in enumerate(synsets):
            sys.stdout.write('{:f} / {:f}      \r'.format(i, len(synsets)))
            sim = synsets[i].path_similarity(synsets[j])
            if sim is None: sim = 0
            distance[i,j] = 1. - sim
    Y, eig = doMDS(distance, dimensions=2)
    pylab.figure(1)
    for i, s in enumerate(synsets):
        text(Y[i,0],Y[i,1], s.name, fontsize=8)
    pylab.plot(Y[:,0],Y[:,1],'.')
    pylab.show()
    
if __name__ == '__main__':
    # script entry: load the databases from 'Slicing.db' and look up the
    # 'concept' domain in their merged domains
    prac = PRAC()
    dbs = list(
        Database.load(
            prac.mln,
            os.path.join('/', 'home', 'nyga', 'work', 'nl_corpora',
                         'wikihow', 'Slicing.db')))
    domains = mergedom(*map(lambda d: d.domains, dbs))
    concepts = domains.get('concept')