def get_synsets(self, synsetids, ctx=None, **kwargs):
    """ Fetch a SynsetCollection containing one synset per ID in *synsetids*. """
    collection = SynsetCollection()
    for synset_id in synsetids:
        collection.add(self.get_synset(synset_id, ctx=ctx))
    return collection
def get_synsets(self, synsetids, lang='eng', ctx=None):
    ''' Fetch synsets for every ID in *synsetids*, restricted to *lang*. '''
    found = SynsetCollection()
    for synset_id in synsetids:
        found.add(self.get_synset(synset_id, lang=lang, ctx=ctx))
    return found
def __init__(self, filenames=None, memory_save=False, verbose=False):
    """ Initialise the DAO and optionally read the given XML files.

    :param filenames: optional list of XML file paths to read immediately
    :param memory_save: when True, skip storing some optional fields
    :param verbose: when True, emit a summary while reading
    """
    self.synsets = SynsetCollection()
    self.memory_save = memory_save
    self.verbose = verbose
    # Fix: always define self.filenames so later attribute reads cannot
    # raise AttributeError when no filenames were supplied.
    self.filenames = filenames
    if filenames:
        self.readfiles(filenames)
def results_to_synsets(self, results, synsets=None, ctx=None, **kwargs):
    """ Hydrate DB result rows into a SynsetCollection (created on demand). """
    if synsets is None:
        synsets = SynsetCollection()
    for row in results:
        synsets.add(self.get_synset(synsetid=row.ID, ctx=ctx, **kwargs))
    return synsets
def search_ex(self, query, deep_select=True, ignore_case=True, lang='eng', synsets=None, ctx=None, **kwargs):
    """ Search synsets whose definition matches *query* (SQL LIKE pattern). """
    if ignore_case:
        conditions, params = ['lower(def) LIKE ?', 'lang=?'], [query.lower(), lang]
    else:
        conditions, params = ['def LIKE ?', 'lang=?'], [query, lang]
    synsetinfos = ctx.sex.select(' AND '.join(conditions), params, columns=('synset',))
    if synsets is None:
        synsets = SynsetCollection()
    # de-duplicate: only fetch synsets not already in the collection
    for info in synsetinfos:
        if info.synset not in synsets:
            synsets.add(self.get_synset(info.synset, lang=lang, ctx=ctx))
    return synsets
def search(self, lemma, pos=None, deep_select=True, ignore_case=True, synsets=None, ctx=None, **kwargs):
    """ Search synsets by lemma, optionally filtered by part-of-speech.

    When *lemma* contains SQL wildcards ('%' or '_') it is escaped and an
    ESCAPE clause is used so the characters match literally-escaped patterns.
    """
    if '%' in lemma or '_' in lemma:
        like_phrase = " LIKE ? ESCAPE '@'"
        lemma = escape_like(lemma)
    else:
        like_phrase = ' LIKE ? '
    # case-insensitive search lowercases both column and parameter
    term_col = 'lower(term)' if ignore_case else 'term'
    query = ['ID IN (SELECT sid FROM term WHERE {} {})'.format(term_col, like_phrase)]
    params = [lemma.lower() if ignore_case else lemma]
    if pos:
        query.append('pos = ?')
        params.append(pos)
    # query synsetids
    results = ctx.synset.select(' AND '.join(query), params, columns=('ID',))
    if deep_select:
        return self.results_to_synsets(results, ctx=ctx, synsets=synsets)
    return SynsetCollection(synsets=(Synset(row.ID) for row in results))
def get_synset_by_sks(self, sensekeys):
    """ Look up synsets by a list of sensekeys. """
    synsets = SynsetCollection()
    with Execution(self.schema) as exe:
        # build one '?' placeholder per sensekey
        placeholders = ','.join(['?'] * len(sensekeys))
        where = 'id IN (SELECT sid FROM sensekey where sensekey IN (%s))' % placeholders
        results = exe.schema.synset.select(where=where, values=sensekeys)
        if results:
            return self.results_to_synsets(results, exe, synsets)
    return synsets
def get_synsets_by_lemma(self, lemma):
    """ Build full Synset objects (definition, keys, examples) for a lemma. """
    with Execution(self.schema) as exe:
        # get synset object
        rows = exe.schema.wss.select(where='lemma=?', values=(lemma,))
        synsets = SynsetCollection()
        if rows:
            for row in rows:
                ss = Synset(row.synsetid)
                ss.definition = row.definition
                ss.add_lemma(row.lemma)
                ss.add_key(row.sensekey)
                ss.tagcount = row.tagcount
                # attach usage examples, ordered by sampleid
                examples = exe.schema.ex.select(where='synsetid=?', values=[row.synsetid], orderby='sampleid')
                for example in examples:
                    ss.exes.append(example.sample)
                synsets.add(ss)
        return synsets
def get_synsets_by_ids(self, synsetids):
    """ Fetch synsets for the given IDs (normalised to GWN-SQL form). """
    sids = [str(SynsetID.from_string(sid).to_gwnsql()) for sid in synsetids]
    synsets = SynsetCollection()
    with Execution(self.schema) as exe:
        placeholders = ','.join(['?'] * len(sids))
        results = exe.schema.synset.select(where='id IN (%s)' % placeholders, values=sids)
        if results:
            return self.results_to_synsets(results, exe, synsets)
    return synsets
def search(self, lemma, pos=None, deep_select=True, synsets=None, ignore_case=True, ctx=None, **kwargs):
    """ Search synsets by lemma (SQL LIKE pattern), optionally filtered by POS.

    :param lemma: lemma or LIKE pattern to search for
    :param pos: POS filter; 'a' also matches adjective satellites ('s')
    :param synsets: optional SynsetCollection to accumulate results into
    :param ignore_case: lowercase both column and parameter when True
    :param ctx: database context (must not be None)
    :returns: a SynsetCollection of matching synsets
    """
    # Build query
    if ignore_case:
        query = ['wordid IN (SELECT wordid FROM words WHERE lower(lemma) LIKE ?)']
        params = [lemma.lower()]
    else:
        query = ['wordid IN (SELECT wordid FROM words WHERE lemma LIKE ?)']
        params = [lemma]
    if pos == 'a':
        # ss_type: https://wordnet.princeton.edu/man/wndb.5WN.html
        # n NOUN / v VERB / a ADJECTIVE / s ADJECTIVE SATELLITE / r ADVERB
        # 'a' deliberately matches both plain adjectives and satellites
        query.append("synsetid IN (SELECT synsetid FROM synsets WHERE pos IN ('a', 's'))")
    elif pos:
        query.append('synsetid IN (SELECT synsetid FROM synsets WHERE pos = ?)')
        params.append(pos)
    # find synsetIDs first
    senses = ctx.senses.select(' AND '.join(query), params, columns=('synsetid',))
    # Fix: honour a caller-supplied collection instead of silently discarding
    # it (consistent with the other search/results_to_synsets methods).
    if synsets is None:
        synsets = SynsetCollection()
    if senses:
        for sense in senses:
            ss = self.get_synset(sense.synsetid, ctx=ctx)
            synsets.add(ss)
    return synsets
def all_synsets(self, synsets=None, deep_select=True):
    """ Retrieve every synset in the database.

    :param synsets: optional SynsetCollection to accumulate results into
    :param deep_select: when True return fully-hydrated synsets; otherwise
        return the raw DB result rows
    """
    # Fix: honour a caller-supplied collection instead of silently
    # overwriting it with a fresh one.
    if synsets is None:
        synsets = SynsetCollection()
    with Execution(self.schema) as exe:
        # synset;
        results = exe.schema.synset.select()
        if results:
            if deep_select:
                return self.results_to_synsets(results, exe, synsets)
            else:
                return results
    return synsets
def search(self, lemma, pos=None, lang='eng', deep_select=True, ignore_case=True, synsets=None, ctx=None, **kwargs):
    """ Search synsets by lemma in the given language, optionally by POS. """
    if ignore_case:
        word_conditions = ['lower(lemma) LIKE ?', 'lang=?']
        params = [lemma.lower(), lang]
    else:
        word_conditions = ['lemma LIKE ?', 'lang=?']
        params = [lemma, lang]
    if pos is not None:
        word_conditions.append('pos = ?')
        params.append(pos)
    # ctx is not None
    # outer query: senses whose word matches, further restricted by sense lang
    query = ['wordid in (SELECT wordid FROM word WHERE {})'.format(' AND '.join(word_conditions)), 'lang=?']
    params.append(lang)
    senses = ctx.sense.select(' AND '.join(query), params, columns=('synset', 'lang',))
    if synsets is None:
        synsets = SynsetCollection()
    for sense in senses:
        if sense.synset in synsets:
            continue
        synsets.add(self.get_synset(sense.synset, lang=sense.lang, ctx=ctx))
    return synsets
def results_to_synsets(self, results, exe, synsets=None):
    """ Hydrate synset DB rows into GlossedSynset objects.

    For each row: fetches terms, sensekeys, raw glosses, glosses, and for
    each gloss its items and sense tags, wiring tags back to their items
    via an id map.

    :param results: rows with an ``id`` attribute (synset ids)
    :param exe: an open Execution whose schema is queried per synset
    :param synsets: optional SynsetCollection to accumulate into
    """
    if synsets is None:
        synsets = SynsetCollection()
    for result in results:
        ss = GlossedSynset(result.id)
        sid = ss.sid.to_gwnsql()
        # term;
        terms = exe.schema.term.select(where='sid=?', values=[sid])
        for term in terms:
            ss.add_lemma(term.term)
        # sensekey;
        sks = exe.schema.sensekey.select(where='sid=?', values=[sid])
        for sk in sks:
            ss.add_key(sk.sensekey)
        # gloss_raw | sid cat gloss
        rgs = exe.schema.gloss_raw.select(where='sid=?', values=[sid])
        for rg in rgs:
            ss.add_raw_gloss(rg.cat, rg.gloss)
        # gloss; DB: id origid sid cat | OBJ: gid origid cat
        glosses = exe.schema.gloss.select(where='sid=?', values=[sid])
        for gl in glosses:
            gloss = ss.add_gloss(gl.origid, gl.cat, gl.id)
            # glossitem;
            # OBJ | gloss, order, tag, lemma, pos, cat, coll, rdf, origid, sep, text
            # DB | id ord gid tag lemma pos cat coll rdf sep text origid
            glossitems = exe.schema.glossitem.select(where='gid=?', values=[gl.id])
            # item_map: gloss item id -> item object, used to attach tags below
            item_map = {}
            for gi in glossitems:
                item = gloss.add_gloss_item(gi.tag, gi.lemma, gi.pos, gi.cat, gi.coll, gi.rdf, gi.origid, gi.sep, gi.text, gi.id)
                item_map[item.itemid] = item
            # sensetag;
            # OBJ: tagid cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma
            # DB: id cat tag glob glob_lemma glob_id coll sid gid sk origid lemma itemid
            tags = exe.schema.sensetag.select(where='gid=?', values=[gl.id])
            for tag in tags:
                gloss.tag_item(item_map[tag.itemid], tag.cat, tag.tag, tag.glob, tag.glob_lemma, tag.glob_id, tag.coll, tag.origid, tag.sid, tag.sk, tag.lemma, tag.id)
        synsets.add(ss)
    return synsets
def get_synsets_by_term(self, term, pos=None, synsets=None, sid_only=False):
    """ Look up synsets by surface term (case-insensitive exact match).

    :param term: the term to search for (lowercased before matching)
    :param pos: optional POS filter
    :param synsets: optional SynsetCollection to accumulate results into
    :param sid_only: when True return the raw rows instead of full synsets
    """
    # Fix: honour a caller-supplied collection instead of silently
    # overwriting it with a fresh one.
    if synsets is None:
        synsets = SynsetCollection()
    with Execution(self.schema) as exe:
        # synset;
        if pos:
            results = exe.schema.synset.select(where='pos = ? AND id IN (SELECT sid FROM term where lower(term)=?)', values=[pos, term.lower()])
        else:
            results = exe.schema.synset.select(where='id IN (SELECT sid FROM term where lower(term)=?)', values=[term.lower()])
        if results:
            if sid_only:
                return results
            else:
                return self.results_to_synsets(results, exe, synsets)
    return synsets
class GWordnetXML:
    """ GWordNet XML Data Access Object

    Reads Gloss-WordNet XML files and parses them into GlossedSynset
    objects collected in ``self.synsets``.
    """

    def __init__(self, filenames=None, memory_save=False, verbose=False):
        # synsets parsed so far (grows as files are read)
        self.synsets = SynsetCollection()
        # when True, some optional fields (tag/lemma) are not stored
        self.memory_save = memory_save
        self.verbose = verbose
        if filenames:
            self.filenames = filenames
            self.readfiles(filenames)

    def readfiles(self, files):
        """ Read from multiple XML files """
        for filename in files:
            self.read(filename)

    def read(self, filename):
        """ Read all synsets from an XML file """
        logging.info('Loading %s' % filename)
        with open(filename, 'rb') as infile:
            # iterparse streams the file so large XML files are not
            # fully loaded into memory; parsed elements are cleared below
            tree = etree.iterparse(infile)
            # NOTE(review): this Counter supports .count()/.summarise(),
            # so it is presumably a project Counter, not collections.Counter
            # -- confirm against the file's imports.
            c = Counter()
            for event, element in tree:
                if event == 'end' and element.tag == 'synset':
                    synset = self.parse_synset(element)
                    # free the parsed element to keep memory flat
                    element.clear()
                    self.synsets.add(synset)
                    # end if end-synset
                c.count(element.tag)
            if self.verbose:
                c.summarise()
            return self.synsets

    def parse_synset(self, element):
        """ Convert one <synset> XML element into a GlossedSynset. """
        synset = GlossedSynset(element.get('id'))
        for child in element:
            if child.tag == 'terms':
                for grandchild in child:
                    # term is a lemma
                    if grandchild.tag == 'term':
                        synset.add_lemma(StringTool.strip(grandchild.text))
            elif child.tag == 'keys':
                for grandchild in child:
                    if grandchild.tag == 'sk':
                        synset.add_key(StringTool.strip(grandchild.text))
            elif child.tag == 'gloss' and child.get('desc') == 'orig' and not self.memory_save:
                # raw (original) gloss text; skipped in memory-save mode
                if child[0].tag == 'orig':
                    synset.add_raw_gloss(GlossRaw.ORIG, StringTool.strip(child[0].text))
            elif child.tag == 'gloss' and child.get('desc') == 'text' and not self.memory_save:
                # tokenised gloss text; skipped in memory-save mode
                if child[0].tag == 'text':
                    synset.add_raw_gloss(GlossRaw.TEXT, StringTool.strip(child[0].text))
            elif child.tag == 'gloss' and child.get('desc') == 'wsd':
                for grandchild in child:
                    # [2016-02-12 LTA] aux should be parsed as well
                    # [2017-10-25 LTA] classif = domain
                    if grandchild.tag in ('def', 'ex', 'aux', 'classif'):
                        gloss = synset.add_gloss(origid=grandchild.get('id'), cat=StringTool.strip(grandchild.tag))
                        self.parse_gloss(grandchild, gloss)
                        # rip definition
                        pass
        return synset

    def parse_gloss(self, a_node, gloss):
        """ Parse a def node or ex node in Gloss WordNet """
        # What to be expected in a node? aux/mwf/wf/cf/qf
        # mwf <- wf | cf
        # aux <- mwf | qf | wf | cf
        # qf <- mwf | qf | wf | cf
        for child_node in a_node:
            self.parse_node(child_node, gloss)
        pass

    def parse_node(self, a_node, gloss):
        """ Parse node in a def node or an ex node.
        There are 5 possible tags:
        wf : single-word form
        cf : collocation form
        mwf: multi-word form
        qf : single- and double-quoted forms
        aux: auxiliary info
        """
        if a_node.tag == 'wf':
            return self.parse_wf(a_node, gloss)
        elif a_node.tag == 'cf':
            return self.parse_cf(a_node, gloss)
        elif a_node.tag == 'mwf':
            return self.parse_mwf(a_node, gloss)
        elif a_node.tag == 'qf':
            return self.parse_qf(a_node, gloss)
        elif a_node.tag == 'aux':
            return self.parse_aux(a_node, gloss)
        else:
            print("WARNING: I don't understand %s tag" % (a_node.tag))
        pass

    def tag_glossitem(self, id_node, glossitem, tag_obj):
        """ Parse ID element and tag a glossitem """
        sk = StringTool.strip(id_node.get('sk'))
        origid = StringTool.strip(id_node.get('id'))
        coll = StringTool.strip(id_node.get('coll'))
        lemma = StringTool.strip(id_node.get('lemma'))
        if tag_obj is None:
            # no existing tag: create a fresh one on the item's gloss
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '', coll, origid, '', sk, lemma)
        else:
            # existing tag (e.g. from a cf glob): fill in the ID fields
            tag_obj.itemid = glossitem.origid
            tag_obj.sk = sk
            tag_obj.origid = origid
            tag_obj.coll = coll
            tag_obj.lemma = lemma
        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'

    def get_node_text(self, wf_node):
        """ Return text value inside an XML node """
        if _LXML_AVAILABLE:
            # xpath("string()") flattens mixed content into one string
            return StringTool.strip(wf_node.xpath("string()"))
        else:
            # TODO: XML mixed content, don't use text attr here
            return wf_node.text

    def parse_wf(self, wf_node, gloss):
        """ Parse a word feature node and then add to gloss object """
        tag = wf_node.get('tag') if not self.memory_save else ''
        lemma = wf_node.get('lemma') if not self.memory_save else ''
        pos = wf_node.get('pos')
        cat = wf_node.get('type')  # if wf_node.get('type') else 'wf'
        coll = None  # wf_node.get('coll')
        rdf = wf_node.get('rdf')
        origid = wf_node.get('id')
        sep = wf_node.get('sep')
        text = self.get_node_text(wf_node)
        wf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid, sep, text, origid)
        # Then parse id tag if available
        for child in wf_node:
            if child.tag == 'id':
                self.tag_glossitem(child, wf_obj, None)
        return wf_obj

    def parse_cf(self, cf_node, gloss):
        """ Parse a word feature node and then add to gloss object """
        tag = cf_node.get('tag') if not self.memory_save else ''
        lemma = StringTool.strip(cf_node.get('lemma')) if not self.memory_save else ''
        pos = cf_node.get('pos')
        cat = cf_node.get('type')  # if cf_node.get('type') else 'cf'
        coll = cf_node.get('coll')
        rdf = cf_node.get('rdf')
        origid = cf_node.get('id')
        sep = cf_node.get('sep')
        text = self.get_node_text(cf_node)
        # collocation items get a synthetic 'coll:'-prefixed item id
        cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid, sep, text, 'coll:' + coll)
        # Parse glob info if it's available
        for child_node in cf_node:
            if child_node.tag == 'glob':
                glob_tag = child_node.get('tag')
                glob_glob = child_node.get('glob')
                glob_lemma = child_node.get('lemma')
                glob_coll = child_node.get('coll')
                glob_id = child_node.get('id')
                # def tag_item(self, item, cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma):
                tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf', glob_tag, glob_glob, glob_lemma, glob_id, glob_coll, '', '', '', '')
                for grandchild in child_node:
                    if grandchild.tag == 'id':
                        self.tag_glossitem(grandchild, cf_obj, tag_obj)
        return cf_obj

    def parse_mwf(self, mwf_node, gloss):
        """ Parse a multi-word form node: parse children into the gloss. """
        child_nodes = []
        for child_node in mwf_node:
            a_node = self.parse_node(child_node, gloss)
        # [TODO] Add mwf tag to child nodes

    def parse_qf(self, qf_node, gloss):
        """ Parse a quoted-form node: parse children into the gloss. """
        child_nodes = []
        for child_node in qf_node:
            a_node = self.parse_node(child_node, gloss)
        # [TODO] Add qf tag to child nodes

    def parse_aux(self, aux_node, gloss):
        """ Parse an auxiliary-info node: parse children into the gloss. """
        child_nodes = []
        for child_node in aux_node:
            a_node = self.parse_node(child_node, gloss)