Пример #1
0
 def get_synsets(self, synsetids, ctx=None, **kwargs):
     """ Collect the synsets for several synset IDs

     Every ID in ``synsetids`` is resolved with ``self.get_synset`` (using the
     given ``ctx``) and added to a new SynsetCollection, which is returned.
     Extra keyword arguments are accepted but not used here.
     """
     collection = SynsetCollection()
     for synset_id in synsetids:
         collection.add(self.get_synset(synset_id, ctx=ctx))
     return collection
Пример #2
0
 def get_synsets(self, synsetids, lang='eng', ctx=None):
     """ Collect the synsets for several synset IDs in one language

     Every ID in ``synsetids`` is resolved with ``self.get_synset`` (passing
     ``lang`` and ``ctx`` through) and added to a new SynsetCollection,
     which is returned.
     """
     collection = SynsetCollection()
     for synset_id in synsetids:
         collection.add(self.get_synset(synset_id, lang=lang, ctx=ctx))
     return collection
Пример #3
0
 def __init__(self, filenames=None, memory_save=False, verbose=False):
     """ Create the reader and optionally load files right away

     Arguments:
         filenames   -- files to read immediately via ``self.readfiles``
                        (nothing is read when this is empty or None)
         memory_save -- stored on the instance as ``self.memory_save``
         verbose     -- stored on the instance as ``self.verbose``
     """
     self.synsets = SynsetCollection()
     self.memory_save = memory_save
     self.verbose = verbose
     # Always define the attribute so it exists even when nothing is loaded
     self.filenames = filenames
     if filenames:
         self.readfiles(filenames)
Пример #4
0
 def results_to_synsets(self, results, synsets=None, ctx=None, **kwargs):
     """ Resolve query rows into synsets

     For each row in ``results`` the synset identified by ``row.ID`` is
     fetched with ``self.get_synset`` (``ctx`` and any extra keyword arguments
     are forwarded) and added to ``synsets``. A new SynsetCollection is
     created when ``synsets`` is None. The collection is returned.
     """
     collection = SynsetCollection() if synsets is None else synsets
     for row in results:
         collection.add(self.get_synset(synsetid=row.ID, ctx=ctx, **kwargs))
     return collection
Пример #5
0
 def search_ex(self, query, deep_select=True, ignore_case=True, lang='eng', synsets=None, ctx=None, **kwargs):
     """ Find synsets through a LIKE match on the ``def`` column of ``ctx.sex``

     ``query`` is used as the LIKE pattern (lower-cased on both sides when
     ``ignore_case`` is true) and rows are restricted to ``lang``. Each
     matching synset that is not yet in the collection is fetched with
     ``self.get_synset`` and added. A new SynsetCollection is created when
     ``synsets`` is None. ``deep_select`` and extra keyword arguments are
     accepted but not used here.
     """
     # Choose the column expression and pattern according to case sensitivity
     column, pattern = ('lower(def)', query.lower()) if ignore_case else ('def', query)
     conditions = ['{} LIKE ?'.format(column), 'lang=?']
     matches = ctx.sex.select(' AND '.join(conditions), [pattern, lang], columns=('synset',))
     found = SynsetCollection() if synsets is None else synsets
     for match in matches:
         if match.synset in found:
             continue
         found.add(self.get_synset(match.synset, lang=lang, ctx=ctx))
     return found
Пример #6
0
 def search(self,
            lemma,
            pos=None,
            deep_select=True,
            ignore_case=True,
            synsets=None,
            ctx=None,
            **kwargs):
     """ Search synsets by term using a LIKE match

     A lemma containing ``%`` or ``_`` is passed through ``escape_like`` and
     matched with ``ESCAPE '@'``. With ``ignore_case`` both sides of the
     comparison are lower-cased. ``pos`` adds an extra ``pos = ?`` filter.

     With ``deep_select`` the rows are handed to ``self.results_to_synsets``
     (together with ``ctx`` and ``synsets``); otherwise a SynsetCollection of
     bare ``Synset(ID)`` objects is returned. Extra keyword arguments are
     accepted but not used here.
     """
     if '%' in lemma or '_' in lemma:
         like_clause = " LIKE ? ESCAPE '@'"
         lemma = escape_like(lemma)
     else:
         like_clause = ' LIKE ? '
     if ignore_case:
         term_column, needle = 'lower(term)', lemma.lower()
     else:
         term_column, needle = 'term', lemma
     conditions = [
         'ID IN (SELECT sid FROM term WHERE {} {})'.format(term_column, like_clause)
     ]
     values = [needle]
     if pos:
         conditions.append('pos = ?')
         values.append(pos)
     # Only the synset IDs are selected here
     rows = ctx.synset.select(' AND '.join(conditions), values, columns=('ID', ))
     if not deep_select:
         return SynsetCollection(synsets=(Synset(row.ID) for row in rows))
     return self.results_to_synsets(rows, ctx=ctx, synsets=synsets)
Пример #7
0
 def get_synset_by_sks(self, sensekeys):
     """ Get the synsets that own any of the given sensekeys

     Rows of the ``synset`` table whose id appears in ``sensekey`` for one of
     ``sensekeys`` are selected and, when there are any, converted with
     ``self.results_to_synsets``. An empty SynsetCollection is returned when
     the select yields nothing.
     """
     collection = SynsetCollection()
     with Execution(self.schema) as exe:
         # One '?' placeholder per sensekey
         placeholders = ','.join('?' for _ in range(len(sensekeys)))
         condition = 'id IN (SELECT sid FROM sensekey where sensekey IN (%s))' % placeholders
         rows = exe.schema.synset.select(where=condition, values=sensekeys)
         if rows:
             return self.results_to_synsets(rows, exe, collection)
     return collection
Пример #8
0
 def get_synsets_by_lemma(self, lemma):
     """ Get the synsets whose ``wss`` rows match ``lemma`` exactly

     One Synset is built per matching row (definition, lemma, sensekey and
     tagcount are copied from the row) and its examples are loaded from the
     ``ex`` table ordered by ``sampleid``. Returns a SynsetCollection, empty
     when nothing matches.
     """
     with Execution(self.schema) as exe:
         sense_rows = exe.schema.wss.select(where='lemma=?', values=(lemma,))
         collection = SynsetCollection()
         if not sense_rows:
             return collection
         for sense_row in sense_rows:
             synset = Synset(sense_row.synsetid)
             synset.definition = sense_row.definition
             synset.add_lemma(sense_row.lemma)
             synset.add_key(sense_row.sensekey)
             synset.tagcount = sense_row.tagcount
             # Examples are fetched per synset row
             examples = exe.schema.ex.select(where='synsetid=?',
                                             values=[sense_row.synsetid],
                                             orderby='sampleid')
             for example in examples:
                 synset.exes.append(example.sample)
             collection.add(synset)
         return collection
Пример #9
0
 def get_synsets_by_ids(self, synsetids):
     """ Get synsets for a list of synset ID strings

     Each ID is parsed with ``SynsetID.from_string`` and converted with
     ``to_gwnsql`` before being used in an ``id IN (...)`` query on the
     ``synset`` table. Matching rows are converted with
     ``self.results_to_synsets``; an empty SynsetCollection is returned when
     the select yields nothing.
     """
     gwn_ids = [str(SynsetID.from_string(raw_id).to_gwnsql()) for raw_id in synsetids]
     collection = SynsetCollection()
     with Execution(self.schema) as exe:
         # One '?' placeholder per ID
         placeholders = ','.join('?' for _ in gwn_ids)
         rows = exe.schema.synset.select(where='id IN (%s)' % placeholders, values=gwn_ids)
         if rows:
             return self.results_to_synsets(rows, exe, collection)
     return collection
Пример #10
0
 def search(self,
            lemma,
            pos=None,
            deep_select=True,
            synsets=None,
            ignore_case=True,
            ctx=None,
            **kwargs):
     """ Search synsets by lemma using a LIKE match

     Arguments:
         lemma       -- LIKE pattern matched against ``words.lemma``
         pos         -- optional part-of-speech filter; ``'a'`` matches both
                        ``'a'`` and ``'s'`` synsets
         synsets     -- collection to add the found synsets to; a new
                        SynsetCollection is created when this is None
         ignore_case -- lower-case both sides of the comparison
         ctx         -- object whose ``senses.select`` runs the query
     ``deep_select`` and extra keyword arguments are accepted but not used.

     Returns the collection holding the found synsets.
     """
     # Build query
     if ignore_case:
         query = [
             'wordid IN (SELECT wordid FROM words WHERE lower(lemma) LIKE ?)'
         ]
         params = [lemma.lower()]
     else:
         query = ['wordid IN (SELECT wordid FROM words WHERE lemma LIKE ?)']
         params = [lemma]
     if pos == 'a':
         # ss_type: https://wordnet.princeton.edu/man/wndb.5WN.html
         # n    NOUN
         # v    VERB
         # a    ADJECTIVE
         # s    ADJECTIVE SATELLITE
         # r    ADVERB
         query.append(
             "synsetid IN (SELECT synsetid FROM synsets WHERE pos IN ('a', 's'))"
         )
     elif pos:
         query.append(
             'synsetid IN (SELECT synsetid FROM synsets WHERE pos = ?)')
         params.append(pos)
     # find synsetIDs first
     senses = ctx.senses.select(' AND '.join(query),
                                params,
                                columns=('synsetid', ))
     # Honour a caller-supplied collection instead of discarding it
     if synsets is None:
         synsets = SynsetCollection()
     # `or ()` keeps the original tolerance for a None/empty select result
     for sense in senses or ():
         synsets.add(self.get_synset(sense.synsetid, ctx=ctx))
     return synsets
Пример #11
0
 def all_synsets(self, synsets=None, deep_select=True):
     """ Select every row of the ``synset`` table

     Arguments:
         synsets     -- collection to add the synsets to; a new
                        SynsetCollection is created when this is None
         deep_select -- when true, rows are converted with
                        ``self.results_to_synsets``; when false the raw
                        select result is returned as-is

     Returns the collection unchanged when the select yields nothing.
     """
     # Honour a caller-supplied collection instead of discarding it
     if synsets is None:
         synsets = SynsetCollection()
     with Execution(self.schema) as exe:
         # synset;
         results = exe.schema.synset.select()
         if results:
             if deep_select:
                 return self.results_to_synsets(results, exe, synsets)
             return results
     return synsets
Пример #12
0
 def search(self, lemma, pos=None, lang='eng', deep_select=True, ignore_case=True, synsets=None, ctx=None, **kwargs):
     """ Search synsets by lemma within one language using a LIKE match

     Words are filtered by lemma (lower-cased on both sides when
     ``ignore_case`` is true), by ``lang`` and optionally by ``pos``; senses
     of those words in ``lang`` are then selected from ``ctx.sense``. Each
     synset not yet in the collection is fetched with ``self.get_synset``
     and added. A new SynsetCollection is created when ``synsets`` is None.
     ``deep_select`` and extra keyword arguments are accepted but not used.
     """
     lemma_column, needle = ('lower(lemma)', lemma.lower()) if ignore_case else ('lemma', lemma)
     word_conditions = ['{} LIKE ?'.format(lemma_column), 'lang=?']
     values = [needle, lang]
     if pos is not None:
         word_conditions.append('pos = ?')
         values.append(pos)
     # The language filter is applied to the sense rows as well as to the words
     where = 'wordid in (SELECT wordid FROM word WHERE {}) AND lang=?'.format(' AND '.join(word_conditions))
     values.append(lang)
     rows = ctx.sense.select(where, values, columns=('synset', 'lang',))
     found = SynsetCollection() if synsets is None else synsets
     for row in rows:
         if row.synset in found:
             continue
         found.add(self.get_synset(row.synset, lang=row.lang, ctx=ctx))
     return found
Пример #13
0
 def results_to_synsets(self, results, exe, synsets=None):
     """ Build populated GlossedSynset objects from synset rows

     For every row in ``results`` a GlossedSynset is created from ``row.id``
     and filled from the tables reachable through ``exe.schema``: terms,
     sensekeys, raw glosses, glosses, gloss items and sense tags. Each synset
     is added to ``synsets`` (a new SynsetCollection when None), which is
     returned.
     """
     collection = SynsetCollection() if synsets is None else synsets
     for row in results:
         synset = GlossedSynset(row.id)
         gwn_sid = synset.sid.to_gwnsql()
         # term;
         for term_row in exe.schema.term.select(where='sid=?', values=[gwn_sid]):
             synset.add_lemma(term_row.term)
         # sensekey;
         for key_row in exe.schema.sensekey.select(where='sid=?', values=[gwn_sid]):
             synset.add_key(key_row.sensekey)
         # gloss_raw | sid cat gloss
         for raw_row in exe.schema.gloss_raw.select(where='sid=?', values=[gwn_sid]):
             synset.add_raw_gloss(raw_row.cat, raw_row.gloss)
         # gloss; DB: id origid sid cat | OBJ: gid origid cat
         for gloss_row in exe.schema.gloss.select(where='sid=?', values=[gwn_sid]):
             gloss = synset.add_gloss(gloss_row.origid, gloss_row.cat, gloss_row.id)
             # glossitem;
             # OBJ | gloss, order, tag, lemma, pos, cat, coll, rdf, origid, sep, text
             # DB  | id ord gid tag lemma pos cat coll rdf sep text origid
             item_rows = exe.schema.glossitem.select(where='gid=?', values=[gloss_row.id])
             # Items are indexed by itemid so sense tags can be attached to them
             items_by_id = {}
             for item_row in item_rows:
                 item = gloss.add_gloss_item(item_row.tag, item_row.lemma, item_row.pos,
                                             item_row.cat, item_row.coll, item_row.rdf,
                                             item_row.origid, item_row.sep, item_row.text,
                                             item_row.id)
                 items_by_id[item.itemid] = item
             # sensetag;
             # OBJ: tagid cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma
             # DB: id cat tag glob glob_lemma glob_id coll sid gid sk origid lemma itemid
             for tag_row in exe.schema.sensetag.select(where='gid=?', values=[gloss_row.id]):
                 gloss.tag_item(items_by_id[tag_row.itemid], tag_row.cat, tag_row.tag,
                                tag_row.glob, tag_row.glob_lemma, tag_row.glob_id,
                                tag_row.coll, tag_row.origid, tag_row.sid, tag_row.sk,
                                tag_row.lemma, tag_row.id)
         collection.add(synset)
     return collection
Пример #14
0
 def get_synsets_by_term(self, term, pos=None, synsets=None, sid_only=False):
     """ Get the synsets that contain ``term`` (case-insensitive exact match)

     Arguments:
         term     -- term to look up; compared lower-cased
         pos      -- optional part-of-speech filter
         synsets  -- collection to add the synsets to; a new
                     SynsetCollection is created when this is None
         sid_only -- when true, the raw select result is returned instead of
                     converted synsets

     Returns the collection unchanged when the select yields nothing.
     """
     # Honour a caller-supplied collection instead of discarding it
     if synsets is None:
         synsets = SynsetCollection()
     with Execution(self.schema) as exe:
         # synset;
         if pos:
             results = exe.schema.synset.select(where='pos = ? AND id IN (SELECT sid FROM term where lower(term)=?)', values=[pos, term.lower()])
         else:
             results = exe.schema.synset.select(where='id IN (SELECT sid FROM term where lower(term)=?)', values=[term.lower()])
         if results:
             if sid_only:
                 return results
             return self.results_to_synsets(results, exe, synsets)
     return synsets
Пример #15
0
class GWordnetXML:
    """ GWordNet XML Data Access Object

    Parsed synsets are accumulated in ``self.synsets``.
    """
    def __init__(self, filenames=None, memory_save=False, verbose=False):
        """ Create the reader and optionally load XML files right away

        Arguments:
            filenames   -- XML files to read immediately (nothing is read
                           when this is empty or None)
            memory_save -- when true, raw orig/text glosses and the tag and
                           lemma attributes of wf/cf nodes are not stored
            verbose     -- when true, the tag counter is summarised after
                           each file is read
        """
        self.synsets = SynsetCollection()
        self.memory_save = memory_save
        self.verbose = verbose
        # Always define the attribute so it exists even when nothing is loaded
        self.filenames = filenames
        if filenames:
            self.readfiles(filenames)

    def readfiles(self, files):
        """ Read from multiple XML files
        """
        for filename in files:
            self.read(filename)

    def read(self, filename):
        """ Read all synsets from an XML file

        Returns ``self.synsets``, which also holds synsets from earlier reads.
        """
        logging.info('Loading %s', filename)
        with open(filename, 'rb') as infile:
            tree = etree.iterparse(infile)
            # NOTE(review): Counter must provide count()/summarise(), so this
            # is presumably a project class rather than collections.Counter
            c = Counter()
            for event, element in tree:
                if event == 'end' and element.tag == 'synset':
                    synset = self.parse_synset(element)
                    # The subtree is no longer needed once it has been parsed
                    element.clear()
                    self.synsets.add(synset)
                c.count(element.tag)

            if self.verbose:
                c.summarise()
            return self.synsets

    def parse_synset(self, element):
        """ Build a GlossedSynset from a synset XML element

        Terms, sensekeys, raw glosses (unless ``memory_save``) and the
        ``wsd`` gloss children (def/ex/aux/classif) are read.
        """
        synset = GlossedSynset(element.get('id'))
        raw_gloss_cats = {'orig': GlossRaw.ORIG, 'text': GlossRaw.TEXT}
        for child in element:
            if child.tag == 'terms':
                for grandchild in child:
                    # term is a lemma
                    if grandchild.tag == 'term':
                        synset.add_lemma(StringTool.strip(grandchild.text))
            elif child.tag == 'keys':
                for grandchild in child:
                    if grandchild.tag == 'sk':
                        synset.add_key(StringTool.strip(grandchild.text))
            elif child.tag == 'gloss':
                desc = child.get('desc')
                if desc == 'wsd':
                    for grandchild in child:
                        # [2016-02-12 LTA] aux should be parsed as well
                        # [2017-10-25 LTA] classif = domain
                        if grandchild.tag in ('def', 'ex', 'aux', 'classif'):
                            gloss = synset.add_gloss(
                                origid=grandchild.get('id'),
                                cat=StringTool.strip(grandchild.tag))
                            self.parse_gloss(grandchild, gloss)
                elif desc in raw_gloss_cats and not self.memory_save:
                    # The raw text lives in the first child, whose tag must
                    # equal desc; an empty gloss element is skipped instead
                    # of raising IndexError
                    if len(child) > 0 and child[0].tag == desc:
                        synset.add_raw_gloss(raw_gloss_cats[desc],
                                             StringTool.strip(child[0].text))
        return synset

    def parse_gloss(self, a_node, gloss):
        """ Parse a def node or ex node in Gloss WordNet
        """
        # What to be expected in a node? aux/mwf/wf/cf/qf
        # mwf <- wf | cf
        # aux <- mwf | qf | wf | cf
        # qf <- mwf | qf | wf | cf
        for child_node in a_node:
            self.parse_node(child_node, gloss)

    def parse_node(self, a_node, gloss):
        """ Parse node in a def node or an ex node.
            There are 5 possible tags:
            wf : single-word form
            cf : collocation form
            mwf: multi-word form
            qf : single- and double-quoted forms
            aux: auxiliary info

            Returns whatever the tag-specific parser returns; None for an
            unknown tag (a warning is logged).
        """
        if a_node.tag == 'wf':
            return self.parse_wf(a_node, gloss)
        elif a_node.tag == 'cf':
            return self.parse_cf(a_node, gloss)
        elif a_node.tag == 'mwf':
            return self.parse_mwf(a_node, gloss)
        elif a_node.tag == 'qf':
            return self.parse_qf(a_node, gloss)
        elif a_node.tag == 'aux':
            return self.parse_aux(a_node, gloss)
        logging.warning("I don't understand %s tag", a_node.tag)
        return None

    def tag_glossitem(self, id_node, glossitem, tag_obj):
        """ Parse ID element and tag a glossitem

        When ``tag_obj`` is None a new tag is created through
        ``glossitem.gloss.tag_item``; otherwise the given tag object is
        updated in place.
        """
        sk = StringTool.strip(id_node.get('sk'))
        origid = StringTool.strip(id_node.get('id'))
        coll = StringTool.strip(id_node.get('coll'))
        lemma = StringTool.strip(id_node.get('lemma'))

        if tag_obj is None:
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                               coll, origid, '', sk, lemma)
        else:
            tag_obj.itemid = glossitem.origid
            tag_obj.sk = sk
            tag_obj.origid = origid
            tag_obj.coll = coll
            tag_obj.lemma = lemma

        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'

    def get_node_text(self, wf_node):
        """ Return text value inside an XML node """
        if _LXML_AVAILABLE:
            return StringTool.strip(wf_node.xpath("string()"))
        # Without lxml: join all inner text. The node may have mixed content
        # (text that follows child elements), which ``.text`` alone misses.
        return StringTool.strip(''.join(wf_node.itertext()))

    def parse_wf(self, wf_node, gloss):
        """ Parse a word feature node and then add to gloss object
        """
        tag = wf_node.get('tag') if not self.memory_save else ''
        lemma = wf_node.get('lemma') if not self.memory_save else ''
        pos = wf_node.get('pos')
        cat = wf_node.get('type')  # if wf_node.get('type') else 'wf'
        coll = None  # wf_node.get('coll')
        rdf = wf_node.get('rdf')
        origid = wf_node.get('id')
        sep = wf_node.get('sep')
        text = self.get_node_text(wf_node)
        wf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid,
                                      sep, text, origid)
        # Then parse id tag if available
        for child in wf_node:
            if child.tag == 'id':
                self.tag_glossitem(child, wf_obj, None)
        return wf_obj

    def parse_cf(self, cf_node, gloss):
        """ Parse a collocation form node and then add to gloss object
        """
        tag = cf_node.get('tag') if not self.memory_save else ''
        lemma = StringTool.strip(
            cf_node.get('lemma')) if not self.memory_save else ''
        pos = cf_node.get('pos')
        cat = cf_node.get('type')  # if cf_node.get('type') else 'cf'
        coll = cf_node.get('coll')
        rdf = cf_node.get('rdf')
        origid = cf_node.get('id')
        sep = cf_node.get('sep')
        text = self.get_node_text(cf_node)
        # NOTE(review): raises TypeError when the node has no 'coll'
        # attribute (coll is None) -- confirm every cf node carries one
        cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid,
                                      sep, text, 'coll:' + coll)
        # Parse glob info if it's available
        for child_node in cf_node:
            if child_node.tag == 'glob':
                glob_tag = child_node.get('tag')
                glob_glob = child_node.get('glob')
                glob_lemma = child_node.get('lemma')
                glob_coll = child_node.get('coll')
                glob_id = child_node.get('id')
                #            def tag_item(self, item,   cat,  tag,      glob,      glemma,     gid,     coll,      origid, sid, sk, lemma):
                tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf', glob_tag,
                                                glob_glob, glob_lemma, glob_id,
                                                glob_coll, '', '', '', '')
                for grandchild in child_node:
                    if grandchild.tag == 'id':
                        self.tag_glossitem(grandchild, cf_obj, tag_obj)
        return cf_obj

    def parse_mwf(self, mwf_node, gloss):
        """ Parse every child of a multi-word form node into ``gloss`` """
        for child_node in mwf_node:
            self.parse_node(child_node, gloss)
        # [TODO] Add mwf tag to child nodes

    def parse_qf(self, qf_node, gloss):
        """ Parse every child of a quoted form node into ``gloss`` """
        for child_node in qf_node:
            self.parse_node(child_node, gloss)
        # [TODO] Add qf tag to child nodes

    def parse_aux(self, aux_node, gloss):
        """ Parse every child of an auxiliary info node into ``gloss`` """
        for child_node in aux_node:
            self.parse_node(child_node, gloss)