예제 #1
0
def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON

    Every synset ID found in ``args.input`` (eight digits, a dash and one of
    the letters ``n``, ``v``, ``a``, ``r``, ``x``) is looked up through
    ``get_gwn()``.  For each one the lemmas and the definition (after
    ``fix_gwn_def``) are reported and collected as a dict with the keys
    ``synset``, ``lemmas`` and ``definition``.  The collected records are
    split in two halves which are written as JSON to
    ``<args.output>_1.json`` and ``<args.output>_2.json``.

    Arguments:
        cli  -- not used by this function
        args -- object providing ``input`` (path to read) and ``output``
                (prefix of the two output files)
    '''
    # NOTE(review): TextReport() without a path presumably reports to stdout - confirm
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        # Raw string: in a plain literal '\d' is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({
                "synset": sid,
                "lemmas": ss.lemmas,
                "definition": sdef
            })
    # Integer division; with an odd number of records the second half is larger
    cut = len(data) // 2
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
예제 #2
0
 def test_get_gloss_synsets(self):
     # Smoke test without assertions: select every row through the gloss
     # table of the schema, then print the row count and the first five rows.
     print("Test get glossed synset(s)")
     gloss_rows = get_gwn().schema.gloss.select()
     total = len(gloss_rows)
     print("Gloss count: {}".format(total))
     print(gloss_rows[:5])
예제 #3
0
 def test_dump_synset(self):
     # Fetch synset '01775535-v', check that it exists and that its lemmas,
     # keys and glosses are all non-empty, then hand it to dump_synset.
     print("Test get synset by ID")
     synset = get_synset_by_id(get_gwn(), '01775535-v')
     self.assertIsNotNone(synset)
     # Attributes are read one at a time so each check runs before the next lookup
     for attr_name in ('lemmas', 'keys', 'glosses'):
         self.assertGreater(len(getattr(synset, attr_name)), 0)
     dump_synset(synset)
예제 #4
0
def export_gwnsql_synsets(args):
    ''' Export synsets' lemmas, definitions and examples to text files

    Four files are written under ``./data``:
      * glosstag_lemmas.txt      -- canonical synset ID <TAB> term, one per line
      * glosstag_lemmas_noss.txt -- term only, one per line
      * glosstag_defs.txt        -- synset ID <TAB> text of each 'def' gloss
      * glosstag_exes.txt        -- synset ID <TAB> text of each 'ex' gloss

    Synsets come from ``get_gwnxml(args)`` when ``args.mockup`` is set,
    otherwise from ``gwn.all_synsets()``.
    '''
    print(
        "Exporting synsets' info (lemmas/defs/examples) from GlossWordNet (SQLite) to text file"
    )
    show_info(args)
    lemma_path = os.path.abspath('./data/glosstag_lemmas.txt')
    lemma_noss_path = os.path.abspath('./data/glosstag_lemmas_noss.txt')
    def_path = os.path.abspath('./data/glosstag_defs.txt')
    ex_path = os.path.abspath('./data/glosstag_exes.txt')
    gwn = get_gwn(args)

    # Pick the synset source, then order the synsets by canonical ID
    synsets = get_gwnxml(args).synsets if args.mockup else gwn.all_synsets()
    synsets.synsets.sort(key=lambda entry: entry.sid.to_canonical())

    with open(def_path, 'w') as def_out, \
            open(ex_path, 'w') as ex_out, \
            open(lemma_path, 'w') as lemma_out, \
            open(lemma_noss_path, 'w') as lemma_noss_out:
        for synset in synsets:
            # Terms are written in alphabetical order of their text
            for entry in sorted(synset.terms, key=lambda item: item.term):
                lemma_out.write(
                    '%s\t%s\n' % (synset.sid.to_canonical(), entry.term))
                lemma_noss_out.write('%s\n' % (entry.term, ))
            # Glosses other than 'def' and 'ex' are ignored
            for gloss in synset.glosses:
                if gloss.cat == 'def':
                    def_out.write(
                        '{sid}\t{d}\n'.format(sid=synset.sid, d=gloss.text()))
                    continue
                if gloss.cat == 'ex':
                    ex_out.write(
                        '{sid}\t{ex}\n'.format(sid=synset.sid, ex=gloss.text()))

    # summary
    print("Data has been extracted to:")
    for written_path in (lemma_path, lemma_noss_path, def_path, ex_path):
        print("  + {}".format(written_path))
    print("Extracted synsets: {}".format(len(synsets)))
    print("Done!")
예제 #5
0
파일: lex2pred.py 프로젝트: letuananh/omwtk
from puchikarui import Schema, with_ctx
from coolisf import GrammarHub
from chirptext.leutile import grouper
from chirptext.io import CSV
from chirptext import TextReport, FileHelper, Counter, FileHub
from chirptext.cli import CLIApp, setup_logging
from yawlib.helpers import get_gwn
from yawlib.helpers import get_wn, get_omw

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------

# Absolute path of the './data' folder (with '~' expanded), resolved
# against the current working directory at import time.
DATA_FOLDER = os.path.abspath(os.path.expanduser('./data'))
# Module-level handles obtained once, at import time, from the yawlib helpers.
omw = get_omw()
gwn = get_gwn()
wn = get_wn()
# NOTE(review): presumably 'logging.json' is the logging config file and
# 'logs' the log folder - confirm against chirptext.cli.setup_logging.
setup_logging('logging.json', 'logs')
ghub = GrammarHub()
# Locations resolved relative to the directory containing this module.
MY_DIR = os.path.dirname(__file__)
SETUP_FILE = os.path.join(MY_DIR, 'scripts', 'ewdb.sql')
# Mapping from the single-letter keys n/v/a/r to root names; 'r' maps to an
# empty string.  NOTE(review): keys look like WordNet POS codes - confirm.
ROOTS = {'n': 'root_wn_n', 'v': 'root_wn_v', 'a': 'root_wn_adj', 'r': ''}
DEFAULT_DB_PATH = FileHelper.abspath('data/ewmap.db')


class EWDB(Schema):
    # Schema subclass; only the nested Flags constants are visible in this
    # excerpt.
    class Flags:
        # Integer constants grouped under one namespace.
        # NOTE(review): their meanings are only suggested by the names
        # (MWE is presumably "multi-word expression") - confirm with callers.
        PROCESSED = 1
        NO_PARSE = 2
        MWE = 3
        MWE_PURE = 100
예제 #6
0
 def test_get_by_sk(self):
     # Look up a synset through get_synset_by_sk using the key below, with
     # self.nullrep passed as the report file, and check something is returned.
     lookup_key = 'test%2:41:00::'
     found = get_synset_by_sk(get_gwn(), lookup_key, report_file=self.nullrep)
     self.assertIsNotNone(found)