Exemplo n.º 1
0
def get_synset_by_sk(gwn, sk, report_file=None, compact=True):
    ''' Look up a synset in the WordNet Gloss Corpus by its sensekey.

    Arguments:
        gwn         -- Gloss WordNet accessor object
        sk          -- Sensekey string to search for
        report_file -- An instance of TextReport (defaults to stdout)
        compact     -- Passed through to dump_synset()
    '''
    # Fall back to a stdout-backed report when no report is supplied
    report = report_file if report_file is not None else TextReport()
    report.print("Looking for synsets by sensekey (Provided: %s)\n" % sk)
    found = gwn.get_synset_by_sk(sk)
    dump_synset(found, report_file=report, compact=compact)
    return found
Exemplo n.º 2
0
def get_synsets_by_term(gwn, t, pos=None, report_file=None, compact=True):
    ''' Look up synsets in the WordNet Gloss Corpus by term (optionally filtered by POS).

    Arguments:
        gwn         -- Gloss WordNet accessor object
        t           -- Term (lemma) to search for
        pos         -- Optional part-of-speech filter
        report_file -- An instance of TextReport (defaults to stdout)
        compact     -- Passed through to dump_synsets()
    '''
    # Fall back to a stdout-backed report when no report is supplied
    report = report_file if report_file is not None else TextReport()
    report.print(
        "Looking for synsets by term (Provided: %s | pos = %s)\n" % (t, pos))
    found = gwn.get_synsets_by_term(t, pos)
    dump_synsets(found, report, compact=compact)
    return found
Exemplo n.º 3
0
def get_synsets_by_term(wng_db_loc, t, pos, report_file=None):
    ''' Search synset in WordNet Gloss Corpus by term.

    Arguments:
        wng_db_loc  -- Path to the Gloss WordNet SQLite database
        t           -- Term (lemma) to search for
        pos         -- Part-of-speech filter (may be None)
        report_file -- An instance of TextReport (defaults to stdout)
    '''
    if report_file is None:  # PEP 8: compare to None with `is`, not `==`
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by term (Provided: %s | pos = %s)\n" % (t, pos))

    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.get_synsets_by_term(t, pos)
    dump_synsets(synsets, report_file)
    # Return the results so callers can use them programmatically
    # (previously this function only printed and returned None)
    return synsets
Exemplo n.º 4
0
def get_synset_by_sk(wng_db_loc, sk, report_file=None):
    ''' Search synset in WordNet Gloss Corpus by sensekey.

    Arguments:
        wng_db_loc  -- Path to the Gloss WordNet SQLite database
        sk          -- Sensekey string to search for
        report_file -- An instance of TextReport (defaults to stdout)
    '''
    if report_file is None:  # PEP 8: compare to None with `is`, not `==`
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by sensekey (Provided: %s)\n" % sk)

    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.get_synset_by_sk(sk)
    dump_synsets(synsets, report_file)
    # Return the results so callers can use them programmatically
    # (previously this function only printed and returned None)
    return synsets
Exemplo n.º 5
0
def get_synset_by_id(wng_db_loc, synsetid, report_file=None):
    ''' Search synset in WordNet Gloss Corpus by synset ID.

    Arguments:
        wng_db_loc  -- Path to the Gloss WordNet SQLite database
        synsetid    -- Synset ID, e.g. '12345678-n' (canonical) or DB-internal form
        report_file -- An instance of TextReport (defaults to stdout)
    '''
    if report_file is None:  # PEP 8: compare to None with `is`, not `==`
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by synsetid (Provided: %s)\n" % synsetid)

    db = SQLiteGWordNet(wng_db_loc)
    # Rewrite canonical 'OOOOOOOO-p' form (offset + dash + pos letter) into
    # 'pOOOOOOOO' -- presumably the key format the GWN database expects; confirm
    if len(synsetid) == 10 and synsetid[8] == '-':
        synsetid = synsetid[9] + synsetid[:8]
    synsets = db.get_synset_by_id(synsetid)
    dump_synsets(synsets, report_file)
    # Return the results so callers can use them programmatically
    # (previously this function only printed and returned None)
    return synsets
Exemplo n.º 6
0
def omw_vs_gwn():
    ''' Report OMW synsets that are absent from GWN, writing a tab-separated
    listing (ID, lemma, definition(s)) to data/omw_new.txt '''
    rp = TextReport("data/omw_new.txt")
    # synset IDs present in OMW but not in GWN
    only_in_omw = set(get_omw_synsets()) - set(get_gwn_synsets())
    for ssid in only_in_omw:
        synset = omw.get_synset(ssid)
        lemma = synset.lemma if synset.lemma else "** no lex **"
        definitions = synset.definitions
        if not definitions:
            rp.print(join("\t", synset.ID, lemma, "** no def **"))
        elif len(definitions) == 1:
            rp.print(join("\t", synset.ID, lemma, synset.definition))
        else:
            # first definition on the main row, extras indented below
            rp.print(join("\t", synset.ID, lemma, definitions[0]))
            for extra in definitions[1:]:
                rp.print("\t\t" + extra)
Exemplo n.º 7
0
def dump_synsets(synsets, report_file=None):
    ''' Dump a SynsetCollection to stdout.

    Arguments:
        synsets     -- List of synsets to dump
        report_file -- An instance of TextReport (defaults to stdout)
    '''
    if report_file is None:  # PEP 8: compare to None with `is`, not `==`
        report_file = TextReport()  # Default to stdout

    # NOTE: a falsy (empty) collection also takes the "None was found!"
    # branch, which reads correctly for an empty result set
    if synsets:
        for synset in synsets:
            dump_synset(synset, report_file=report_file)
        report_file.print("Found %s synset(s)" % synsets.count())
    else:
        report_file.print("None was found!")
Exemplo n.º 8
0
class TestHelperMethods(unittest.TestCase):
    ''' Unit tests for the synset lookup and dump helper functions '''

    # Shared sink report: discards all helper output during tests
    nullrep = TextReport('/dev/null')

    def test_dump_synset(self):
        print("Test get synset by ID")
        gloss_wn = get_gwn()
        synset = get_synset_by_id(gloss_wn, '01775535-v')
        self.assertIsNotNone(synset)
        self.assertGreater(len(synset.lemmas), 0)
        self.assertGreater(len(synset.keys), 0)
        self.assertGreater(len(synset.glosses), 0)
        dump_synset(synset)

    def test_dump_synsets(self):
        # Must tolerate a missing collection without raising
        dump_synsets(None)

    def test_get_by_term(self):
        found = get_synsets_by_term(GWNSQL(YLConfig.GWN30_DB), 'test',
                                    report_file=self.nullrep)
        self.assertEqual(len(found), 13)

    def test_get_by_sk(self):
        found = get_synset_by_sk(get_gwn(), 'test%2:41:00::',
                                 report_file=self.nullrep)
        self.assertIsNotNone(found)
Exemplo n.º 9
0
 def test_timer(self):
     ''' Timer should report Started/Stopped markers via start/stop and via do() '''
     report = TextReport.string()
     timer = Timer(report=report)
     msg = "Do something expensive"
     timer.start(msg)
     do_expensive()
     timer.stop(msg)
     getLogger().debug(report.content())
     self.assertIn("Started", report.content())
     self.assertIn("Stopped", report.content())
     # the do() convenience wrapper must emit the same markers
     report = TextReport.string()
     timer = Timer(report=report)
     timer.do(lambda: do_expensive(), desc=msg)
     self.assertIn("Started", report.content())
     self.assertIn("Stopped", report.content())
     getLogger().debug(report.content())
Exemplo n.º 10
0
def get_synset_by_id(gwn, synsetid_str, report_file=None, compact=True):
    ''' Look up a synset in the WordNet Gloss Corpus by synset ID.

    Arguments:
        gwn          -- Gloss WordNet accessor object
        synsetid_str -- Synset ID string, e.g. '12345678-n'
        report_file  -- An instance of TextReport (defaults to stdout)
        compact      -- Passed through to dump_synset()
    '''
    report = report_file if report_file is not None else TextReport()
    report.print("Looking for synsets by synsetid (Provided: %s)\n" %
                 synsetid_str)
    try:
        # normalise the ID string, then query GlossWordnet
        sid = SynsetID.from_string(synsetid_str)
        found = gwn.get_synset_by_id(sid.to_gwnsql())
        dump_synset(found, report_file=report, compact=compact)
        return found
    except Exception as e:
        # malformed IDs (and lookup failures) are logged, not raised
        logger.exception(e)
        logger.error(
            "  >>>> Error: (Synset ID should be in this format 12345678-n)")
Exemplo n.º 11
0
 def test_textreport(self):
     ''' content() is empty unless the report is backed by a string buffer '''
     with TextReport.null() as report:
         report.writeline("null")
         report.writeline(123)
         self.assertEqual(report.content(), '')
     with TextReport() as report:
         report.writeline("stdout")
         report.writeline(123)
         self.assertEqual(report.content(), '')
     with TextReport(os.path.join(TEST_DATA, "del.me")) as report:
         report.writeline("ABC")
         report.writeline(123)
         self.assertEqual(report.content(), '')
     # leaving the with-block must close the underlying file
     self.assertTrue(report.closed)
     # only the string-backed report captures what was written
     with TextReport.string() as report:
         report.writeline("ABC")
         report.writeline(123, 456, 789)
         self.assertEqual(report.content(), 'ABC\n123 456 789\n')
Exemplo n.º 12
0
 def test_locate_config_file(self):
     ''' AppConfig probes a fixed, ordered list of candidate config paths '''
     # JSON mode: every candidate appears both with and without the .json suffix
     cfg = AppConfig(name='foo', mode=AppConfig.JSON)
     actual = cfg.potentials()
     expected = ['./.foo', './.foo.json', './foo', './foo.json', './data/foo', './data/foo.json', './data/.foo', './data/.foo.json', '~/.foo', '~/.foo.json', '~/.foo/config', '~/.foo.json/config', '~/.foo/config.json', '~/.foo.json/config.json', '~/.config/foo', '~/.config/foo.json', '~/.config/.foo', '~/.config/.foo.json', '~/.config/foo/config', '~/.config/foo.json/config', '~/.config/foo/config.json', '~/.config/foo.json/config.json', '~/.config/foo/foo', '~/.config/foo.json/foo.json']
     self.assertEqual(actual, expected)
     # default mode is INI
     cfg_ini = AppConfig('chirptest', working_dir=os.path.dirname(__file__))
     self.assertEqual(cfg_ini.config.sections(), ['AUTHOR'])
     # values in [DEFAULT] are visible both directly and through other sections
     self.assertEqual(cfg_ini.config['DEFAULT']['package'], 'chirptext.test')
     self.assertEqual(cfg_ini.config['DEFAULT']['tester'], 'unittest')
     self.assertEqual(cfg_ini.config['AUTHOR']['name'], 'Le Tuan Anh')
     self.assertEqual(cfg_ini.config['AUTHOR']['tester'], 'unittest')
     # missing keys fall back via configparser's fallback= mechanism
     self.assertEqual(cfg_ini.config.get('AUTHOR', 'desc', fallback='nothing'), 'nothing')
     # test writing config
     with TextReport.string() as strfile:
         cfg_ini.config['AUTHOR']['desc'] = 'An author'
         cfg_ini.config.write(strfile.file)
         self.assertIn('desc = An author', strfile.content())
Exemplo n.º 13
0
def dump_synsets(synsets, report_file=None, compact=True):
    ''' Write every synset in a collection to a report.

    Arguments:
        synsets     -- List of synsets to dump
        report_file -- An instance of TextReport (defaults to stdout)
        compact     -- Passed through to dump_synset()
    '''
    report = report_file if report_file is not None else TextReport()
    if synsets is None:
        report.print("None was found!")
    else:
        for ss in synsets:
            dump_synset(ss, report_file=report, compact=compact)
        report.print("Found %s synset(s)" % synsets.count())
Exemplo n.º 14
0
def gen_stats(corpus_file, report_path=None):
    ''' Generate statistics for a text corpus (word count, most frequent words, etc.)

    Arguments:
        corpus_file -- Path to a whitespace-tokenised corpus; tokens may be
                       "word/POS" pairs, only the word part is counted
        report_path -- Optional output file path (defaults to stdout via TextReport)
    '''
    report = TextReport(report_path)
    report.header("Stat for %s" % corpus_file)
    line_count = 0
    word_count = 0
    c = Counter()
    # Stream the corpus line by line instead of loading it all with readlines()
    with open(corpus_file, 'r', encoding='utf8') as infile:
        for line in infile:
            line_count += 1
            for token in line.split():
                # Drop the POS part of "word/POS" tokens (previously the POS
                # was assigned to a variable that was never used)
                word = token.split("/")[0]
                for spechar in SPECIAL_CHARS:
                    word = word.replace(spechar, '')
                word = word.lower().replace("_", " ")  # original word form
                if word == '':
                    print(token)  # flag tokens that reduce to an empty string
                c.count(word)
                word_count += 1
    report.writeline("Line count: %s" % line_count)
    report.writeline("Word count: %s" % word_count)
    report.writeline("Word class: %s" % len(c.sorted_by_count()))
    report.writeline("Top %d    :" % TOP_K)
    for item in c.sorted_by_count()[:TOP_K]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("Bottom %d :" % TOP_K)
    for item in c.sorted_by_count()[-TOP_K:]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("-" * 80)
    for item in c.group_by_count():
        report.writeline("%s: %s" % (item[0], ', '.join(DaoPhay.vn_sorted(item[1]))))
Exemplo n.º 15
0
def dump_synset(ss,
                compact_gloss=False,
                compact_tags=False,
                more_compact=True,
                report_file=None,
                compact=True):
    ''' Print synset details for debugging purpose

    Arguments:
        ss            -- Synset object to dump
        compact_gloss -- Don't dump gloss tokens' details
        compact_tags  -- Don't dump tagged senses' details
        more_compact  -- Don't dump full details of synset
        report_file   -- Report file to write to
        compact       -- One-line "(cat) text" rendering per gloss instead of
                         full per-gloss headers, items and tags

    '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout

    # Header: either a short summary (sid + lemmas + keys) or the full synset repr
    if more_compact:
        report_file.header(
            "Synset: %s (lemmas=%s | keys=%s)" %
            (ss.sid.to_canonical(), ss.lemmas, ss.keys), 'h0')
    else:
        report_file.header("Synset: %s" % ss, 'h0')

    # Raw glosses: in compact mode only the 'orig' category is shown
    for rgloss in ss.raw_glosses:
        if more_compact:
            if rgloss.cat != 'orig':
                continue
        report_file.print(rgloss)

    gloss_count = itertools.count(1)
    for gloss in ss.glosses:
        if compact:
            # one line per gloss: category + surface text
            report_file.print("({cat}) {txt}".format(cat=gloss.cat,
                                                     txt=gloss.text()))
        else:
            # blank separator line, then a numbered per-gloss header
            report_file.print('')
            report_file.header("Gloss #%s: %s" % (next(gloss_count), gloss),
                               'h2')

            # Dump gloss items
            if compact_gloss:
                report_file.print("Tokens => %s" % gloss.get_gramwords(),
                                  level=2)
            else:
                for item in gloss.items:
                    # one line per item: its deduplicated gramwords plus the item itself
                    report_file.print("%s - { %s }" %
                                      (set(item.get_gramwords()), item),
                                      level=2)
                report_file.print(("-" * 10), level=1)
            # Dump tags
            if compact_tags:
                report_file.print("Tags => %s" % gloss.get_tagged_sensekey(),
                                  level=2)
            else:
                for tag in gloss.tags:
                    report_file.print("%s" % tag, level=1)
    report_file.print('')
Exemplo n.º 16
0
def gen_stats(corpus_file, report_path=None):
    ''' Generate statistics for a text corpus (word count, most frequent words, etc.)

    Arguments:
        corpus_file -- Path to a whitespace-tokenised corpus; tokens may be
                       "word/POS" pairs, only the word part is counted
        report_path -- Optional output file path (defaults to stdout via TextReport)
    '''
    report = TextReport(report_path)
    report.header("Stat for %s" % corpus_file)
    line_count = 0
    word_count = 0
    c = Counter()
    # Stream the corpus line by line instead of loading it all with readlines()
    with open(corpus_file, 'r', encoding='utf8') as infile:
        for line in infile:
            line_count += 1
            for token in line.split():
                # Drop the POS part of "word/POS" tokens (previously the POS
                # was assigned to a variable that was never used)
                word = token.split("/")[0]
                for spechar in SPECIAL_CHARS:
                    word = word.replace(spechar, '')
                word = word.lower().replace("_", " ")  # original word form
                if word == '':
                    print(token)  # flag tokens that reduce to an empty string
                c.count(word)
                word_count += 1
    report.writeline("Line count: %s" % line_count)
    report.writeline("Word count: %s" % word_count)
    report.writeline("Word class: %s" % len(c.sorted_by_count()))
    report.writeline("Top %d    :" % TOP_K)
    for item in c.sorted_by_count()[:TOP_K]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("Bottom %d :" % TOP_K)
    for item in c.sorted_by_count()[-TOP_K:]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("-" * 80)
    for item in c.group_by_count():
        report.writeline("%s: %s" %
                         (item[0], ', '.join(DaoPhay.vn_sorted(item[1]))))
Exemplo n.º 17
0
def omw_vs_gwn_def():
    ''' Compare OMW and GWN definitions synset-by-synset and write a set of
    reports: a master report, plus separate files for true definition
    differences, typo-level differences, and scientific-name differences.
    Synset IDs flagged as DIFF are also written to data/omw_gwn_diff_ssids.txt
    for later re-runs.
    '''
    rp = TextReport("data/omw_gwn_report.txt")
    rpdiff = TextReport("data/omw_gwn_diff.txt")
    rptypo = TextReport("data/omw_gwn_typo.txt")
    rpsn = TextReport("data/omw_gwn_sciname.txt")
    c = Counter(TAGS.ORDER)

    # ssids to compare
    diff_ssids = []
    ssids = read_diff_ssids()
    if not ssids:
        # No cached diff list: compare every synset present in both resources
        print("Generating synset ID list")
        omw_ssids = set(get_omw_synsets())
        gwn_ssids = set(get_gwn_synsets())
        # only care about old GWN synsets
        ssids = omw_ssids.intersection(gwn_ssids)
    else:
        # NOTE(review): ssid_filepath is not defined in this function --
        # presumably a module-level constant used by read_diff_ssids(); verify
        print("Comparing {} synsets loaded from {}".format(len(ssids), ssid_filepath))
    lang = 'eng'
    with omw.ctx() as omw_ctx, gwn.ctx() as gwn_ctx:
        print("Comparing {} synsets".format(len(ssids)))
        for ss in list(ssids):
            ss = str(ss)
            c.count("total")
            tags, odef, gdef = compare_synset(omw, gwn, ss, omw_ctx, gwn_ctx)
            omwss = omw.get_synset(ss, ctx=omw_ctx)
            tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
            if TAGS.DIFF in tags:
                diff_ssids.append(ss)
                # [FCB] why did we change?
                gwnss = gwn.get_synset(ss, ctx=gwn_ctx)
                # If the GWN definition is shared by several synsets, assume it
                # was changed for uniqueness; otherwise attribute it to editors
                glosses = gwn_ctx.gloss.select('surface = ?', (gwnss.definition,))
                if glosses and len(glosses) > 1:
                    tags.add(TAGS.DUP)
                    ssids = [str(SynsetID.from_string(g.sid)) for g in glosses]
                    reason = "Not unique (Shared among {}) so OMW team changed it".format(' '.join(ssids))
                else:
                    tags.add(TAGS.OMW)
                    defs = omw.sdef.select('synset=? and lang=?', (ss, lang))
                    usrs = {d.usr for d in defs if d.usr}
                    usrs_str = ', '.join(usrs) if usrs else "someone in NTU"
                    reason = "{} made this change.".format(usrs_str)
                # tags may have grown above, so rebuild the tag string
                tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
                rpdiff.header("{} {}".format(tags_str, ss))
                rpdiff.print("OMW: {}".format(omwss.definition))
                rpdiff.print("GWN: {}".format(gdef))
                rpdiff.print("Reason: {}".format(reason))
            if TAGS.SCINAME in tags:
                rpsn.header("{} {}".format(tags_str, ss))
                rpsn.print("OMW: {}".format(omwss.definition))
                rpsn.print("GWN: {}".format(gdef))
            if TAGS.REP in tags or TAGS.TYPO in tags:
                rptypo.header("{} {}".format(tags_str, ss))
                rptypo.print("OMW: {}".format(omwss.definition))
                rptypo.print("GWN: {}".format(gdef))
            # master report
            for tag in tags:
                c.count(tag)
            if not tags:
                c.count(TAGS.IDENT)
            rp.header("{} {}".format(tags_str, ss))
            rp.print("OMW: {}".format(omwss.definition))
            rp.print("GWN: {}".format(gdef))

    # done
    c.summarise(report=rp)
    with open('data/omw_gwn_diff_ssids.txt', 'wt') as diff_ssid_file:
        for ss in diff_ssids:
            diff_ssid_file.write('{}\n'.format(ss))
Exemplo n.º 18
0
def dump_synset(ss, compact_gloss=False, compact_tags=False, more_compact=True, report_file=None):
    ''' Print synset details for debugging purpose

    Arguments:
        ss            -- Synset object to dump
        compact_gloss -- Don't dump gloss tokens' details
        compact_tags  -- Don't dump tagged senses' details
        more_compact  -- Don't dump full details of synset
        report_file   -- Report file to write to

    '''
    if report_file is None:  # PEP 8: compare to None with `is`, not `==`
        report_file = TextReport()  # Default to stdout

    if more_compact:
        report_file.header("Synset: %s (terms=%s | keys=%s)" % (ss.get_synsetid(), ss.terms, ss.keys), 'h0')
    else:
        report_file.header("Synset: %s" % ss, 'h0')

    # Raw glosses: in compact mode only the 'orig' category is shown
    for rgloss in ss.raw_glosses:
        if more_compact:
            if rgloss.cat != 'orig':
                continue
        report_file.print(rgloss)

    gloss_count = itertools.count(1)
    for gloss in ss.glosses:
        # bug fix: the blank separator used to go to stdout via a bare print();
        # all other output goes through report_file, so route it there too
        report_file.print('')
        report_file.header("Gloss #%s: %s" % (next(gloss_count), gloss), 'h2')

        # Dump gloss items
        if compact_gloss:
            report_file.print("Tokens => %s" % gloss.get_gramwords(), level=2)
        else:
            for item in gloss.items:
                # one line per item: its deduplicated gramwords plus the item itself
                report_file.print("%s - { %s }" % (set(item.get_gramwords()), item), level=2)
            report_file.print(("-" * 10), level=1)

        # Dump tags
        if compact_tags:
            report_file.print("Tags => %s" % gloss.get_tagged_sensekey(), level=2)
        else:
            for tag in gloss.tags:
                report_file.print("%s" % tag, level=1)
    report_file.print('')
Exemplo n.º 19
0
def do_expensive(n=10000):
    ''' Burn some CPU by writing `n` numbered lines into an in-memory report;
    returns the accumulated text. Used as a workload for timer tests. '''
    buf = TextReport.string()
    for idx in range(n):
        buf.print("This is string number #{}".format(idx))
    return str(buf.content())