Example #1
def cache_all_synsets(wng_db_loc):
    ''' Cache all Gloss WordNet synsets (SQLite) to the database
    '''
    t = Timer()
    t.start("Caching synsets")
    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.all_synsets()
    t.end("Done caching")

    db = WSQL(WORDNET_30_PATH)
    t.start("Start caching stuff ...")
    # This should take less than 5 secs to run
    db.cache_all_sensekey()
    #------------------------------------------
    # This should take less than 25 secs to run
    db.cache_all_hypehypo()
    t.end("Done caching!")
Example #2
def xml2db(xml_files, db):
    ''' Convert an XML file of Gloss WordNet into SQLite
    '''
    t = Timer()

    header("Extracting Gloss WordNet (XML)")
    xmlgwn = XMLGWordNet()
    for xml_file in xml_files:
        t.start('Reading file: %s' % xml_file)
        xmlgwn.read(xml_file)
        t.end("Extraction completed %s" % xml_file)

    header("Inserting data into đáng SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
Example #3
def export_ntumc(wng_loc, wng_db_loc):
    '''
    Export GlossTag to NTU-MC format
    '''
    print("Export GlossTag to NTU-MC")
    merged_folder = os.path.join(wng_loc, 'merged')
    glosstag_ntumc_script = wng_db_loc + ".ntumc.sql"
    
    print("Path to glosstag folder: %s" % (merged_folder))
    print("Path to glosstag DB    : %s" % (wng_db_loc))
    print("Output file            : %s" % (glosstag_ntumc_script))

    gwn = SQLiteGWordNet(wng_db_loc)
    wn = WSQL(WORDNET_30_PATH)

    t = Timer()
    t.start("Retrieving synsets from DB")

    # mockup data
    xml_file = os.path.expanduser('~/wordnet/glosstag/merged/test.xml')
    xmlwn = XMLGWordNet()
    xmlwn.read(xml_file)
    synsets = xmlwn.synsets
#    synsets = gwn.all_synsets()
    print("%s synsets found in %s" % (len(synsets), wng_db_loc))
    t.end()
    t.start("Generating cfrom cto ...")
    with open(glosstag_ntumc_script, 'w') as outfile:
        for ss in synsets:
            sent = ss.raw_glosses[0].gloss
            # print(sent)
            words = []
            for gl in ss.glosses:
                words += [ x.text for x in gl.items ]
            asent = smart_search(sent, words)
            outfile.write("%s\n" % asent.sent)
            for word in asent.words:
                testword = sent[word.cfrom:word.cto]
                if testword != word.text:
                    print("WARNING: Expected [%s] but found [%s]" % (word.text, testword))
                outfile.write("%s [%s:%s] ==> |%s|\n" % (word.text, word.cfrom, word.cto, testword))
    t.end()
    print("Done!")
    
    pass
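
The cfrom/cto loop above is a sanity check on smart_search, which is assumed to align each gloss token to character offsets in the raw sentence; slicing the sentence with those offsets must give back the token. A contrived illustration of the invariant being checked:

# Hypothetical alignment: suppose smart_search mapped the token "fast"
# to characters 2..6 of the raw gloss below.
sent = "a fast car"
cfrom, cto, text = 2, 6, "fast"
assert sent[cfrom:cto] == text  # offsets are consistent with the token
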
Example #4
def extract_synsets_xml():
    xfile_path = 'data/extract.xml'
    synsets = etree.Element("synsets")
    t = Timer()
    c = Counter()

    # Loop through elements in glosstag xml files
    t.start("Extracting synsets from glosstag ...")
    for xml_file in GLOSSTAG_XML_FILES:
        # for xml_file in [ MOCKUP_SYNSETS_DATA ]:
        tree = etree.iterparse(xml_file)
        for event, element in tree:
            if event == 'end' and element.tag == 'synset':
                # do something to the element
                if to_synsetid(element.get('id')) in SYNSETS_TO_EXTRACT:
                    synsets.append(etree.fromstring(etree.tostring(element)))
                    c.count("FOUND")
                else:
                    c.count("IGNORED")
                # Clean up
                element.clear()
    t.end()
    c.summarise()

    # save the tree (nicely?)
    print("Writing synsets to %s" % (xfile_path, ))
    with open(xfile_path, 'wb') as xfile:
        xfile.write(etree.tostring(synsets, pretty_print=True))
    print("Done!")
Example #5
def test_timer(self):
    rp = TextReport.string()
    t = Timer(report=rp)
    msg = "Do something expensive"
    t.start(msg)
    do_expensive()
    t.stop(msg)
    getLogger().debug(rp.content())
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())
    # test do()
    rp = TextReport.string()
    t = Timer(report=rp)
    t.do(lambda: do_expensive(), desc=msg)
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())
    getLogger().debug(rp.content())
Example #6
def convert(args):
    ''' Convert Gloss WordNet XML into SQLite format
    '''
    show_info(args)

    if os.path.isfile(args.glossdb) and os.path.getsize(args.glossdb) > 0:
        print("DB file exists (%s | size: %s)" %
              (args.glossdb, os.path.getsize(args.glossdb)))
        answer = input(
            "If you want to overwrite this file, please type CONFIRM: ")
        if answer != "CONFIRM":
            print("Script aborted!")
            exit()
    db = get_gwn(args)
    header('Importing data from XML to SQLite')
    t = Timer()
    header("Extracting Gloss WordNet (XML)")
    xmlgwn = get_gwnxml(args)
    header("Inserting data into SQLite database")
    t.start()
    db.insert_synsets(xmlgwn.synsets)
    t.end('Insertion completed.')
Example #7
def test_skmap_gwn_wn30():
    ''' Comparing sensekeys between GWN and WN30 SQLite
    '''
    gwn = SQLiteGWordNet(wng_db_loc)
    wn = WSQL(WORDNET_30_PATH)

    t = Timer()
    t.start('Caching WN30 sensekey map')
    wnsks = wn.get_all_sensekeys()
    wn_skmap = {}
    wn_sidmap = dd(list)
    # map by sensekeys and synsetid
    for item in wnsks:
        wn_skmap[item.sensekey] = item.synsetid
        wn_sidmap[str(item.synsetid)[1:]].append(item.sensekey)
    t.end("Done WN30")

    t.start('Caching GWN sensekey map')
    gwn_ss = gwn.get_all_sensekeys()
    gwn_skmap = {}
    for item in gwn_ss:
        gwn_skmap[item.sensekey] = item.sid
    t.end("Done GWN")

    t.start('Caching GWN tagged sensekey')
    gwn_tags = gwn.get_all_sensekeys_tagged()
    t.end("Done tagged sensekey")

    print("wn30 sensekeys: %s" % len(wnsks))
    print("gwn synsets   : %s" % len(gwn_ss))
    print("All tagged sk : %s" % len(gwn_tags))

    c = Counter()
    for tag in gwn_tags:
        if tag not in gwn_skmap:
            print("sk [%s] does not exist in GWN" % tag)
            c.count("GWN Not Found")
        else:
            c.count("GWN Found")
        if tag not in wn_skmap:
            if tag in gwn_skmap:
                gwn_sid = gwn_skmap[tag][1:]
                # print("Searching %s" % (gwn_sid))
                if gwn_sid in wn_sidmap:
                    candidates = wn_sidmap[gwn_sid]
                    newsks = set()
                    for cand in candidates:
                        if cand not in gwn_skmap:
                            newsks.add(cand)
                    # print("Found but changed: %s => %s" % (tag, newsks))
                    c.count("WN30 Found derivative")                    
                else:
                    c.count("WN30 Not Found At all")
                    print("sk [%s] does not exist in WN30 at all ..." % tag)    
            else:
                c.count("WN30 & GWN Not Found")
                print("sk [%s] does not exist in WN30" % tag)
        else:
            c.count("WN30 Found")
    c.summarise()
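
The [1:] slices above rely on the id scheme of WordNet SQL databases, where a synset id is the 8-digit offset prefixed with a part-of-speech digit (1 = noun, 2 = verb, and so on); dropping the first character leaves the bare offset, and the GWN sid is assumed to carry a comparable one-character prefix. A contrived illustration:

# WNSQL-style synset id: POS digit + 8-digit offset
wn30_synsetid = 100001740                # 1 (noun) + offset 00001740 ('entity')
bare_offset = str(wn30_synsetid)[1:]     # -> '00001740'
# gwn_skmap values are assumed to use a similar prefix, so
# gwn_skmap[tag][1:] yields offsets comparable to wn_sidmap keys.
print(bare_offset)
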
Example #8
def main():
    print("Script to compare WNSQL30 to OMW")
    t = Timer()
    t.start("Compare OMW to GWN")
    omw_vs_gwn_def()
    t.end()
Example #9
def export_ntumc(wng_loc, wng_db_loc, mockup=False):
    '''
    Export GlossTag to NTU-MC format
    '''
    print("Export GlossTag to NTU-MC")
    merged_folder = os.path.join(wng_loc, 'merged')
    #    glosstag_ntumc_script = wng_db_loc + ".ntumc.sql"
    glosstag_ntumc_script = GLOSSTAG_NTUMC_OUTPUT + ".script.sql"
    sent_file_path = GLOSSTAG_NTUMC_OUTPUT + '_sent.csv'
    word_file_path = GLOSSTAG_NTUMC_OUTPUT + '_word.csv'
    concept_file_path = GLOSSTAG_NTUMC_OUTPUT + '_concept.csv'

    print("Path to glosstag folder: %s" % (merged_folder))
    print("Path to glosstag DB    : %s" % (wng_db_loc))
    print("Output file            : %s" % (glosstag_ntumc_script))

    t = Timer()
    t.start("Retrieving synsets from DB")

    gwn = SQLiteGWordNet(wng_db_loc)
    if mockup:
        synsets = mockup_synsets()
    else:
        # synsets = gwn.all_synsets()
        wn = WSQL(WORDNET_30_PATH)
        xmlwn = read_xmlwn(GLOSSTAG_XML_FILES)
        synsets = xmlwn.synsets

    print("%s synsets found in %s" % (len(synsets), wng_db_loc))
    t.end()
    t.start("Generating cfrom cto ...")
    with open(glosstag_ntumc_script, 'w') as outfile, \
         open(sent_file_path, 'w') as sent_file, \
         open(word_file_path, 'w') as word_file, \
         open(concept_file_path, 'w') as concept_file:
        outfile.write("""BEGIN TRANSACTION;
   INSERT INTO corpus (corpusID, corpus, title, language)
      VALUES (100, 'misc', "Miscellaneous", "eng"); 
   INSERT INTO doc (docid, doc, title, url, subtitle, corpusID) 
      VALUES(1000, "glosstag", "WordNet with Semantically Tagged Glosses", "http://wordnet.princeton.edu/glosstag.shtml", "", 100);
""")
        sentid = 1000000
        docid = 1000
        glpatch = GlossTagPatch()
        for ss in synsets:
            (ss, sents, glosses, aligned) = prepare_for_ntumc(ss, glpatch)
            # sent = ss.raw_glosses[0].gloss

            # print(sent)
            # [2016-02-01] There is an error in glossitem for synset 01179767-a (a01179767)
            for sent, gl in aligned:
                wordid = 0
                conceptid = 0

                wordid_map = {}
                conceptid_map = {}

                sent_file.write('%s\t%s\n' % (
                    sentid,
                    sent,
                ))
                coll_map = dd(list)
                cwl = []
                CWL = namedtuple("CWL", "cid wid".split())
                words = gl.items
                asent = smart_search(sent, words, lambda x: x.text)
                outfile.write(
                    'INSERT INTO sent (sid,docID,pid,sent,comment,usrname) VALUES(%s,%s,"","%s","[WNSID=%s]","letuananh");\n'
                    % (sentid, docid, asent.sent.replace('"', '""').replace(
                        "'", "''"), ss.get_synsetid()))
                outfile.write('-- WORDS\n')
                for word in asent.words:
                    testword = sent[word.cfrom:word.cto]
                    if testword != word.data.text:
                        print("WARNING: Expected [%s] but found [%s]" %
                          (word.data.text, testword))
                    outfile.write(
                        'INSERT INTO word (sid, wid, word, pos, lemma, cfrom, cto, comment, usrname) VALUES (%s, %s, "%s", "%s", "%s", %s, %s, "", "letuananh");\n'
                        % (sentid, wordid, word.data.text.replace(
                            '"', '""').replace("'", "''"), word.data.pos,
                           word.data.lemma, word.cfrom, word.cto))
                    wordid_map[wordid] = word.data.origid
                    wordid_map[word.data.origid] = wordid
                    if word.data.coll:
                        coll_map[word.data.coll].append(word.data.origid)
                    word_file.write('%s\t%s\t%s\t%s\t%s\n' %
                                    (sentid, word.data.text, word.cfrom,
                                     word.cto, word.data.lemma))
                    wordid += 1
                outfile.write('-- CONCEPTS\n')
                #for gl in ss.glosses:
                for tag in gl.tags:
                    # tag = synsetid in NTU format (12345678-x)
                    if tag.sk and tag.sk != 'purposefully_ignored%0:00:00::':
                        tagged_ss = gwn.get_synset_by_sk(tag.sk)
                        if not tagged_ss:
                            logger.info("sk[%s] could not be found" % (tag.sk))
                        elif len(tagged_ss) > 1:
                            logger.info("Too many synsets found for sk[%s]" %
                                        (tag.sk))
                        else:
                            # outfile.write("--%s\n" % (tagged_ss[0].get_synsetid(),))
                            outfile.write(
                                'INSERT INTO concept (sid, cid, clemma, tag, tags, comment, ntag, usrname) VALUES (%s, %s, "%s", "", "", "%s", "", "letuananh"); --sk=[%s]\n'
                                %
                                (sentid, conceptid, tag.lemma.replace(
                                    '"', '""').replace("'", "''"),
                                 tagged_ss[0].get_synsetid(), tag.sk))
                        conceptid_map[tag.origid] = conceptid
                        conceptid_map[conceptid] = tag.origid
                        if tag.coll:
                            # multiword expression: link the concept to every
                            # word in the collocation
                            for collword in coll_map[tag.coll]:
                                cwl.append(CWL(conceptid,
                                               wordid_map[collword]))
                        elif tag.item:
                            # normal single-word tag
                            cwl.append(
                                CWL(conceptid, wordid_map[tag.item.origid]))
                        # increment only after the links are recorded, so the
                        # cwl rows reference the concept row just inserted
                        conceptid += 1
                # outfile.write("/*%s*/\n" % (wordid_map))
                # outfile.write("/*%s*/\n" % (conceptid_map))
                # outfile.write("/*%s*/\n" % coll_map)
                # outfile.write("/*%s*/\n" % cwl)
                outfile.write('-- Concept-Word Links\n')
                for lnk in cwl:
                    outfile.write(
                        'INSERT INTO cwl (sid, wid, cid, usrname) VALUES (%s, %s, %s, "letuananh");\n'
                        % (sentid, lnk.wid, lnk.cid))
                sentid += 1
                outfile.write('\n')
        # end for synsets
        outfile.write("END TRANSACTION;\n")
    t.end()
    print("Done!")

    pass
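
The INSERT statements above escape embedded quotes by doubling them, which is standard SQL string-literal escaping, because the output is a script file to be run later. When talking to SQLite directly, parameterised queries make the escaping unnecessary; a minimal sketch (hypothetical database file, table layout taken from the script above):

import sqlite3

conn = sqlite3.connect('ntumc.db')  # hypothetical database file
conn.execute(
    "INSERT INTO sent (sid, docID, pid, sent, comment, usrname) "
    "VALUES (?, ?, '', ?, ?, 'letuananh')",
    (1000000, 1000, 'a sample gloss;', '[WNSID=00001740-n]'))
conn.commit()
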
Example #10
def test_alignment(wng_db_loc, mockup=True):
    t = Timer()
    t.start("Cache all SQLite synsets")
    if mockup:
        xmlwn = XMLGWordNet()
        xmlwn.read(MOCKUP_SYNSETS_DATA)
        synsets = xmlwn.synsets
    else:
        logging.info("Using SQLiteGWordNet (%s)" % (WORDNET_30_PATH))
        db = WSQL(WORDNET_30_PATH)
        gwn = SQLiteGWordNet(wng_db_loc)
        synsets = gwn.all_synsets()
    t.end("Done caching")

    c = Counter()
    with open("data/WRONG_SPLIT.txt", 'w') as wrong, open(
            'data/SYNSET_TO_FIX.txt',
            'w') as sslist, open('data/INVALID_ALIGNMENT.txt',
                                 'w') as invalidfile:
        glpatch = GlossTagPatch()
        invalid_synsets = set()
        for ss in synsets:
            orig_glosses = [x.text() for x in ss.glosses]
            (ss, sents, glosses, aligned) = prepare_for_ntumc(ss, glpatch)
            if len(sents) != len(glosses):
                sslist.write("%s\n" % (ss.get_synsetid()))
                wrong.write("[%s] -- %s\n" % (
                    ss.get_synsetid(),
                    ss.raw_glosses[0].gloss,
                ))
                wrong.write("len(sents) = %s\n" % (len(sents)))
                for idx, part in enumerate(sents):
                    wrong.write("    -- %s: %s\n" % (
                        str(idx).rjust(3),
                        part,
                    ))
                wrong.write("len(glosses) = %s\n" % (len(glosses)))
                for idx, gl in enumerate(glosses):
                    wrong.write('    >> %s: %s\n' % (
                        str(idx).rjust(3),
                        gl.items,
                    ))
                wrong.write("len(glosses_orig) = %s\n" % (len(ss.glosses)))
                for idx, gl in enumerate(ss.glosses):
                    wrong.write('    |  %s: %s\n' % (
                        str(idx).rjust(3),
                        gl.items,
                    ))

                c.count("WRONG")
                wrong.write("'%s' : %s\n\n" % (
                    ss.get_synsetid(),
                    sents,
                ))
            else:
                c.count("OK")
            # check word alignment
            invalid = False
            for sent, gl in aligned:
                gltext = ' '.join([x.text for x in gl.items]).replace(';', '')
                if fuzz.ratio(sent, gltext) < 80:
                    print("WARNING [%s]: %s >><< %s" %
                          (ss.get_synsetid(), sent, gltext))
                    invalid = True
            if invalid:
                invalid_synsets.add(ss.get_synsetid())
                invalidfile.write('%s\n' % (ss.get_synsetid(), ))
                invalidfile.write('Split raw gloss : \t%s\n' % (sents, ))
                invalidfile.write('Orig glosses    : \t%s\n' %
                                  (orig_glosses, ))
                invalidfile.write('Combined glosses: \t%s\n--\n\n' %
                                  ([x.text() for x in glosses], ))
        invalidfile.write("\n\ninvalid_synsets=%s" % (invalid_synsets, ))
    c.summarise()
    if c['WRONG'] > 0:
        print(
            "See data/SYNSET_TO_FIX.txt and data/WRONG_SPLIT.txt for more information"
        )
    else:
        print("Everything is OK!")

    print("Done!")
Example #11
def main():
    t = Timer()
    t.start("Script to convert BB to TTL")
    bb2ttl()
    t.end()