def cache_all_synsets(wng_db_loc):
    ''' Cache all Gloss Synset (SQLite) to database '''
    t = Timer()
    t.start("Caching synsets")
    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.all_synsets()
    t.end("Done caching")

    db = WSQL(WORDNET_30_PATH)
    t.start("Start caching stuff ...")
    # This should take less than 5 secs to run
    db.cache_all_sensekey()
    # ------------------------------------------
    # This should take less than 25 secs to run
    db.cache_all_hypehypo()
    t.end("Done caching!")
def xml2db(xml_files, db):
    ''' Convert Gloss WordNet XML files into SQLite '''
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = XMLGWordNet()
    for xml_file in xml_files:
        t.start('Reading file: %s' % xml_file)
        xmlgwn.read(xml_file)
        t.end("Extraction completed %s" % xml_file)

    header("Inserting data into the SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
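# A minimal usage sketch for xml2db() (not part of the original workflow): the
# DB and XML paths below are hypothetical examples; SQLiteGWordNet is assumed
# to take a DB file path, as it does elsewhere in this module.
def demo_xml2db():
    gwn_db = SQLiteGWordNet(os.path.expanduser('~/wordnet/glosstag.db'))
    xml2db([os.path.expanduser('~/wordnet/glosstag/merged/test.xml')], gwn_db)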
def export_ntumc(wng_loc, wng_db_loc):
    ''' Export GlossTag to NTU-MC format '''
    print("Export GlossTag to NTU-MC")
    merged_folder = os.path.join(wng_loc, 'merged')
    glosstag_ntumc_script = wng_db_loc + ".ntumc.sql"

    print("Path to glosstag folder: %s" % (merged_folder))
    print("Path to glosstag DB    : %s" % (wng_db_loc))
    print("Output file            : %s" % (glosstag_ntumc_script))

    gwn = SQLiteGWordNet(wng_db_loc)
    wn = WSQL(WORDNET_30_PATH)

    t = Timer()
    t.start("Retrieving synsets from DB")
    # mockup data
    xml_file = os.path.expanduser('~/wordnet/glosstag/merged/test.xml')
    xmlwn = XMLGWordNet()
    xmlwn.read(xml_file)
    synsets = xmlwn.synsets
    # synsets = gwn.all_synsets()
    print("%s synsets found in %s" % (len(synsets), wng_db_loc))
    t.end()

    t.start("Generating cfrom cto ...")
    with open(glosstag_ntumc_script, 'w') as outfile:
        for ss in synsets:
            sent = ss.raw_glosses[0].gloss
            # print(sent)
            words = []
            for gl in ss.glosses:
                words += [x.text for x in gl.items]
            asent = smart_search(sent, words)
            outfile.write("%s\n" % asent.sent)
            for word in asent.words:
                testword = sent[word.cfrom:word.cto]
                if testword != word.text:
                    print("WARNING: Expected [%s] but found [%s]" % (word.text, testword))
                outfile.write("%s [%s:%s] ==> |%s|\n" % (word.text, word.cfrom, word.cto, testword))
    t.end()
    print("Done!")
def extract_synsets_xml():
    xfile_path = 'data/extract.xml'
    synsets = etree.Element("synsets")
    t = Timer()
    c = Counter()

    # Loop through elements in glosstag xml files
    t.start("Extracting synsets from glosstag ...")
    for xml_file in GLOSSTAG_XML_FILES:
        # for xml_file in [MOCKUP_SYNSETS_DATA]:
        tree = etree.iterparse(xml_file)
        for event, element in tree:
            if event == 'end' and element.tag == 'synset':
                # do something to the element
                if to_synsetid(element.get('id')) in SYNSETS_TO_EXTRACT:
                    synsets.append(etree.fromstring(etree.tostring(element)))
                    c.count("FOUND")
                else:
                    c.count("IGNORED")
            # Clean up
            element.clear()
    t.end()
    c.summarise()

    # save the tree (nicely?)
    print("Writing synsets to %s" % (xfile_path,))
    with open(xfile_path, 'wb') as xfile:
        xfile.write(etree.tostring(synsets, pretty_print=True))
    print("Done!")
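# A minimal sketch for spot-checking the output of extract_synsets_xml()
# (assumption: the extracted file can be read back with XMLGWordNet.read(),
# as is done for the other glosstag XML files in this module).
def demo_read_extracted_synsets():
    xmlwn = XMLGWordNet()
    xmlwn.read('data/extract.xml')
    print("%s synsets loaded from data/extract.xml" % len(xmlwn.synsets))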
def test_timer(self):
    rp = TextReport.string()
    t = Timer(report=rp)
    msg = "Do something expensive"
    t.start(msg)
    do_expensive()
    t.stop(msg)
    getLogger().debug(rp.content())
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())

    # test do()
    rp = TextReport.string()
    t = Timer(report=rp)
    t.do(lambda: do_expensive(), desc=msg)
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())
    getLogger().debug(rp.content())
def convert(args):
    ''' Convert Gloss WordNet XML into SQLite format '''
    show_info(args)

    if os.path.isfile(args.glossdb) and os.path.getsize(args.glossdb) > 0:
        print("DB file exists (%s | size: %s)" % (args.glossdb, os.path.getsize(args.glossdb)))
        answer = input("If you want to overwrite this file, please type CONFIRM: ")
        if answer != "CONFIRM":
            print("Script aborted!")
            exit()
    db = get_gwn(args)
    header('Importing data from XML to SQLite')
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = get_gwnxml(args)

    header("Inserting data into SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
def test_skmap_gwn_wn30():
    ''' Comparing sensekeys between GWN and WN30SQLite '''
    gwn = SQLiteGWordNet(wng_db_loc)
    wn = WSQL(WORDNET_30_PATH)

    t = Timer()
    t.start('Caching WN30 sensekey map')
    wnsks = wn.get_all_sensekeys()
    wn_skmap = {}
    wn_sidmap = dd(list)
    # map by sensekeys and synsetid
    for item in wnsks:
        wn_skmap[item.sensekey] = item.synsetid
        wn_sidmap[str(item.synsetid)[1:]].append(item.sensekey)
    t.end("Done WN30")

    t.start('Caching GWN sensekey map')
    gwn_ss = gwn.get_all_sensekeys()
    gwn_skmap = {}
    for item in gwn_ss:
        gwn_skmap[item.sensekey] = item.sid
    t.end("Done GWN")

    t.start('Caching GWN tagged sensekey')
    gwn_tags = gwn.get_all_sensekeys_tagged()
    t.end("Done tagged sensekey")

    print("wn30 sensekeys: %s" % len(wnsks))
    print("gwn synsets   : %s" % len(gwn_ss))
    print("All tagged sk : %s" % len(gwn_tags))

    c = Counter()
    for tag in gwn_tags:
        if tag not in gwn_skmap:
            print("sk [%s] does not exist in GWN" % tag)
            c.count("GWN Not Found")
        else:
            c.count("GWN Found")
        if tag not in wn_skmap:
            if tag in gwn_skmap:
                gwn_sid = gwn_skmap[tag][1:]
                # print("Searching %s" % (gwn_sid))
                if gwn_sid in wn_sidmap:
                    candidates = wn_sidmap[gwn_sid]
                    newsks = set()
                    for cand in candidates:
                        if cand not in gwn_skmap:
                            newsks.add(cand)
                    # print("Found but changed: %s => %s" % (tag, newsks))
                    c.count("WN30 Found derivative")
                else:
                    c.count("WN30 Not Found At all")
                    print("sk [%s] does not exist in WN30 at all ..." % tag)
            else:
                c.count("WN30 & GWN Not Found")
                print("sk [%s] does not exist in WN30" % tag)
        else:
            c.count("WN30 Found")
    c.summarise()
def main():
    print("Script to compare WNSQL30 to OMW")
    t = Timer()
    t.start("Compare OMW to GWN")
    omw_vs_gwn_def()
    t.end()
def export_ntumc(wng_loc, wng_db_loc, mockup=False):
    ''' Export GlossTag to NTU-MC format '''
    print("Export GlossTag to NTU-MC")
    merged_folder = os.path.join(wng_loc, 'merged')
    # glosstag_ntumc_script = wng_db_loc + ".ntumc.sql"
    glosstag_ntumc_script = GLOSSTAG_NTUMC_OUTPUT + ".script.sql"
    sent_file_path = GLOSSTAG_NTUMC_OUTPUT + '_sent.csv'
    word_file_path = GLOSSTAG_NTUMC_OUTPUT + '_word.csv'
    concept_file_path = GLOSSTAG_NTUMC_OUTPUT + '_concept.csv'

    print("Path to glosstag folder: %s" % (merged_folder))
    print("Path to glosstag DB    : %s" % (wng_db_loc))
    print("Output file            : %s" % (glosstag_ntumc_script))

    t = Timer()
    t.start("Retrieving synsets from DB")

    gwn = SQLiteGWordNet(wng_db_loc)
    if mockup:
        synsets = mockup_synsets()
    else:
        # synsets = gwn.all_synsets()
        wn = WSQL(WORDNET_30_PATH)
        xmlwn = read_xmlwn(GLOSSTAG_XML_FILES)
        synsets = xmlwn.synsets
    print("%s synsets found in %s" % (len(synsets), wng_db_loc))
    t.end()

    t.start("Generating cfrom cto ...")
    with open(glosstag_ntumc_script, 'w') as outfile, \
            open(sent_file_path, 'w') as sent_file, \
            open(word_file_path, 'w') as word_file, \
            open(concept_file_path, 'w') as concept_file:
        outfile.write("""BEGIN TRANSACTION;
INSERT INTO corpus (corpusID, corpus, title, language)
VALUES (100, 'misc', "Miscellaneous", "eng");
INSERT INTO doc (docid, doc, title, url, subtitle, corpusID)
VALUES(1000, "glosstag", "WordNet with Semantically Tagged Glosses", "http://wordnet.princeton.edu/glosstag.shtml", "", 100);
""")
        sentid = 1000000
        docid = 1000
        glpatch = GlossTagPatch()
        for ss in synsets:
            (ss, sents, glosses, aligned) = prepare_for_ntumc(ss, glpatch)
            # sent = ss.raw_glosses[0].gloss
            # print(sent)
            # [2016-02-01] There is an error in glossitem for synset 01179767-a (a01179767)
            for sent, gl in aligned:
                wordid = 0
                conceptid = 0
                wordid_map = {}
                conceptid_map = {}
                sent_file.write('%s\t%s\n' % (sentid, sent,))
                coll_map = dd(list)
                cwl = []
                CWL = namedtuple("CWL", "cid wid".split())
                words = gl.items
                asent = smart_search(sent, words, lambda x: x.text)
                outfile.write('INSERT INTO sent (sid,docID,pid,sent,comment,usrname) VALUES(%s,%s,"","%s","[WNSID=%s]","letuananh");\n' % (sentid, docid, asent.sent.replace('"', '""').replace("'", "''"), ss.get_synsetid()))
                outfile.write('-- WORDS\n')
                for word in asent.words:
                    testword = sent[word.cfrom:word.cto]
                    if testword != word.data.text:
                        print("WARNING: Expected [%s] but found [%s]" % (word.data.text, testword))
                    outfile.write('INSERT INTO word (sid, wid, word, pos, lemma, cfrom, cto, comment, usrname) VALUES (%s, %s, "%s", "%s", "%s", %s, %s, "", "letuananh");\n' % (sentid, wordid, word.data.text.replace('"', '""').replace("'", "''"), word.data.pos, word.data.lemma, word.cfrom, word.cto))
                    wordid_map[wordid] = word.data.origid
                    wordid_map[word.data.origid] = wordid
                    if word.data.coll:
                        coll_map[word.data.coll].append(word.data.origid)
                    word_file.write('%s\t%s\t%s\t%s\t%s\n' % (sentid, word.data.text, word.cfrom, word.cto, word.data.lemma))
                    wordid += 1
                outfile.write('-- CONCEPTS\n')
                # for gl in ss.glosses:
                for tag in gl.tags:
                    # tag = synsetid in NTU format (12345678-x)
                    if tag.sk and tag.sk != 'purposefully_ignored%0:00:00::':
                        tagged_ss = gwn.get_synset_by_sk(tag.sk)
                        if not tagged_ss:
                            logger.info("sk[%s] could not be found" % (tag.sk))
                        elif len(tagged_ss) > 1:
                            logger.info("Too many synsets found for sk[%s]" % (tag.sk))
                        else:
                            # outfile.write("--%s\n" % (tagged_ss[0].get_synsetid(),))
                            outfile.write('INSERT INTO concept (sid, cid, clemma, tag, tags, comment, ntag, usrname) VALUES (%s, %s, "%s", "", "", "%s", "", "letuananh"); --sk=[%s]\n' % (sentid, conceptid, tag.lemma.replace('"', '""').replace("'", "''"), tagged_ss[0].get_synsetid(), tag.sk))
                            conceptid_map[tag.origid] = conceptid
                            conceptid_map[conceptid] = tag.origid
                            if tag.coll:
                                # multiword expression
                                for collword in coll_map[tag.coll]:
                                    cwl.append(CWL(conceptid, wordid_map[collword]))
                            elif tag.item:
                                # normal tag
                                cwl.append(CWL(conceptid, wordid_map[tag.item.origid]))
                            # increment only after linking so the cwl rows refer to the
                            # cid that was just written to the concept table
                            conceptid += 1
                # outfile.write("/*%s*/\n" % (wordid_map))
                # outfile.write("/*%s*/\n" % (conceptid_map))
                # outfile.write("/*%s*/\n" % coll_map)
                # outfile.write("/*%s*/\n" % cwl)
                outfile.write('-- Concept-Word Links\n')
                for lnk in cwl:
                    outfile.write('INSERT INTO cwl (sid, wid, cid, usrname) VALUES (%s, %s, %s, "letuananh");\n' % (sentid, lnk.wid, lnk.cid))
                sentid += 1
            outfile.write('\n')
        # end for synsets
        outfile.write("END TRANSACTION;\n")
    t.end()
    print("Done!")
def test_alignment(wng_db_loc, mockup=True):
    t = Timer()
    t.start("Cache all SQLite synsets")
    if mockup:
        xmlwn = XMLGWordNet()
        xmlwn.read(MOCKUP_SYNSETS_DATA)
        synsets = xmlwn.synsets
    else:
        logging.info("Using SQLiteGWordNet (%s)" % (WORDNET_30_PATH))
        db = WSQL(WORDNET_30_PATH)
        gwn = SQLiteGWordNet(wng_db_loc)
        synsets = gwn.all_synsets()
    t.end("Done caching")

    c = Counter()
    with open("data/WRONG_SPLIT.txt", 'w') as wrong, \
            open('data/SYNSET_TO_FIX.txt', 'w') as sslist, \
            open('data/INVALID_ALIGNMENT.txt', 'w') as invalidfile:
        glpatch = GlossTagPatch()
        invalid_synsets = set()
        for ss in synsets:
            orig_glosses = [x.text() for x in ss.glosses]
            (ss, sents, glosses, aligned) = prepare_for_ntumc(ss, glpatch)
            if len(sents) != len(glosses):
                sslist.write("%s\n" % (ss.get_synsetid()))
                wrong.write("[%s] -- %s\n" % (ss.get_synsetid(), ss.raw_glosses[0].gloss,))
                wrong.write("len(sents) = %s\n" % (len(sents)))
                for idx, part in enumerate(sents):
                    wrong.write(" -- %s: %s\n" % (str(idx).rjust(3), part,))
                wrong.write("len(glosses) = %s\n" % (len(glosses)))
                for idx, gl in enumerate(glosses):
                    wrong.write(' >> %s: %s\n' % (str(idx).rjust(3), gl.items,))
                wrong.write("len(glosses_orig) = %s\n" % (len(ss.glosses)))
                for idx, gl in enumerate(ss.glosses):
                    wrong.write(' | %s: %s\n' % (str(idx).rjust(3), gl.items,))
                c.count("WRONG")
                wrong.write("'%s' : %s\n\n" % (ss.get_synsetid(), sents,))
            else:
                c.count("OK")
            # check word alignment
            invalid = False
            for sent, gl in aligned:
                gltext = ' '.join([x.text for x in gl.items]).replace(';', '')
                if fuzz.ratio(sent, gltext) < 80:
                    print("WARNING [%s]: %s >><< %s" % (ss.get_synsetid(), sent, gltext))
                    invalid = True
            if invalid:
                invalid_synsets.add(ss.get_synsetid())
                invalidfile.write('%s\n' % (ss.get_synsetid(),))
                invalidfile.write('Split raw gloss : \t%s\n' % (sents,))
                invalidfile.write('Orig glosses    : \t%s\n' % (orig_glosses,))
                invalidfile.write('Combined glosses: \t%s\n--\n\n' % ([x.text() for x in glosses],))
        invalidfile.write("\n\ninvalid_synsets=%s" % (invalid_synsets,))
    c.summarise()
    if c['WRONG'] > 0:
        print("See data/SYNSET_TO_FIX.txt and data/WRONG_SPLIT.txt for more information")
    else:
        print("Everything is OK!")
    print("Done!")
def main():
    t = Timer()
    t.start("Script to convert BB to TTL")
    bb2ttl()
    t.end()