def corenlp_to_ttl(cli, args):
    ''' Convert Stanford CoreNLP JSON output to TTL '''
    print("Core NLP output file: {}".format(args.input))
    print("TTL file: {}".format(args.output))
    print("Source (raw) file: {}".format(args.raw))
    cn_sents = json.loads(chio.read_file(args.input))['sentences']
    print("Found {} core-nlp sents".format(len(cn_sents)))
    raw_sents = chio.read_file(args.raw).splitlines()
    _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
    for sent_text, cn_sent in zip(raw_sents, cn_sents):
        ttl_sent = ttl.Sentence(sent_text)
        ttl_sent.tokens = (cn_tk['originalText'] for cn_tk in cn_sent['tokens'])
        for ttl_tk, cn_tk in zip(ttl_sent, cn_sent['tokens']):
            if 'lemma' in cn_tk:
                ttl_tk.lemma = cn_tk['lemma']
            if 'pos' in cn_tk:
                ttl_tk.pos = cn_tk['pos']
        _writer.write_sent(ttl_sent)
    print("{} sentences were written to {}".format(len(raw_sents), args.output))
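# A minimal usage sketch (hypothetical file names): corenlp_to_ttl only reads
# attribute-style arguments, so an argparse.Namespace carrying the fields it
# accesses (input, output, raw, ttl_format, seed) is enough to drive it
# outside of the CLI wiring. `cli` is unused by the function body.
from argparse import Namespace

corenlp_to_ttl(cli=None, args=Namespace(
    input='speeches.json',       # CoreNLP JSON output ({'sentences': [...]})
    raw='speeches.txt',          # one raw sentence per line, aligned with the JSON
    output='speeches.ttl.json',  # TTL destination
    ttl_format='json',           # assumed format label understood by get_ttl_writer
    seed=1))                     # id_seed passed through to the writer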
def _ensure_config(config_path='~/.jamdict/config.json', mkdir=True):
    _path = Path(os.path.expanduser(config_path))
    # auto create config dir
    if mkdir:
        _path.parent.mkdir(exist_ok=True)
    if not _path.exists():
        default_config = read_file(CONFIG_TEMPLATE)
        logging.getLogger(__name__).warning(
            f"Jamdict configuration file could not be found. "
            f"A new configuration file will be generated at {_path}")
        logging.getLogger(__name__).debug(f"Default config: {default_config}")
        write_file(_path, default_config)
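# A quick sketch of the expected behaviour with the defaults above: the first
# call writes CONFIG_TEMPLATE to ~/.jamdict/config.json, subsequent calls are
# no-ops because the file already exists. Note that parent.mkdir() is called
# without parents=True, so only one missing directory level (~/.jamdict) can
# be created; a deeper custom config_path would need its ancestors to exist.
_ensure_config()  # creates ~/.jamdict/config.json if missing
_ensure_config()  # idempotent: nothing to do the second time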
def test_io_with_pathlib(self):
    print("Make sure that io functions work with pathlib.Path")
    # test read & write plain text (JSON payload)
    data = [['name', 'foo'], ['age', '18']]
    json_path = Path(TEST_DATA) / 'temp.json'
    chio.write_file(json_path, json.dumps(data))
    json_data = json.loads(chio.read_file(json_path))
    self.assertEqual(json_data, data)
    # test read & write TSV
    filepath = Path(TEST_DATA) / 'temp.csv'
    chio.write_tsv(filepath, data)
    actual = chio.read_tsv(filepath)
    self.assertEqual(actual, data)
def txt_to_ttl(cli, args):
    ''' Convert a raw text file (one sentence per line) to TTL '''
    print("Input file: {}".format(args.input))
    print("TTL/{} output: {}".format(args.ttl_format, args.output))
    print("With ID column: {}".format(args.with_idcolumn))
    raw_sents = chio.read_file(args.input).splitlines()
    _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
    for sent in raw_sents:
        if args.with_idcolumn:
            sid, text = sent.split('\t', maxsplit=1)
            _writer.write_sent(ttl.Sentence(text=text, ID=sid))
        else:
            _writer.write_sent(ttl.Sentence(text=sent))
    print("Written {} sentences to {}".format(len(raw_sents), args.output))
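# An illustrative input file for the with_idcolumn mode (hypothetical content):
# each line is "<sentence-id><TAB><sentence text>", and because the split uses
# maxsplit=1, any further tabs stay inside the sentence text itself.
#
#   s001\tThe cat sat on the mat.
#   s002\tColourless green ideas sleep furiously.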
def read_config():
    if not __app_config.config and not __app_config.locate_config():
        # need to create a config
        config_dir = os.path.expanduser('~/.jamdict/')
        if not os.path.exists(config_dir):
            os.makedirs(config_dir)
        cfg_loc = os.path.join(config_dir, 'config.json')
        default_config = read_file(CONFIG_TEMPLATE)
        getLogger().warning("Jamdict configuration file could not be found. "
                            "A new configuration file will be generated at {}".format(cfg_loc))
        getLogger().debug("Default config: {}".format(default_config))
        write_file(cfg_loc, default_config)
    # read config
    config = __app_config.config
    return config
def _build_krad_map(self):
    with self.lock:
        lines = chio.read_file(KRADFILE, mode='rt').splitlines()
        # build the krad map (char -> radicals) and its reverse (radical -> chars)
        self.__krad_map = {}
        self.__radk_map = dd(set)
        for line in lines:
            if line.startswith("#"):
                continue
            parts = line.split(':', maxsplit=1)
            if len(parts) == 2:
                rads = [r.strip() for r in parts[1].split()]
                char_literal = parts[0].strip()
                self.__krad_map[char_literal] = rads
                for rad in rads:
                    self.__radk_map[rad].add(char_literal)
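# A standalone sketch of the parsing step above, run against one KRADFILE-style
# line ("<kanji> : <radical> <radical> ...") to make the two maps concrete.
# The sample decomposition is illustrative, not read from the actual data file.
from collections import defaultdict

krad_map, radk_map = {}, defaultdict(set)
line = "明 : 日 月"
char, rads_text = line.split(':', maxsplit=1)
rads = rads_text.split()
krad_map[char.strip()] = rads          # 明 -> ['日', '月']
for rad in rads:
    radk_map[rad].add(char.strip())    # 日 -> {'明'}, 月 -> {'明'}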
def patch_sids(cli, args):
    # rp = TextReport(args.output) if args.output else TextReport()
    if args.gold:
        print("Gold MRS file: {}".format(args.gold))
        sent_ids = []
        if args.idfile:
            print("ID file: {}".format(args.idfile))
            idlines = chio.read_file(args.idfile).splitlines()
            for line in idlines:
                idx, text = line.split('\t', maxsplit=1)
                sent_ids.append((idx, text))
            print("Found {} sentences in ID file".format(len(sent_ids)))
        sents = Document.from_file(args.gold)
        if sent_ids:
            if len(sent_ids) != len(sents):
                print("Mismatched sent ID file - IDs found: {} | MRS sentences found: {}".format(len(sent_ids), len(sents)))
            print("Verifying sentences' text")
            for (sid, stext), mrs_sent in zip(sent_ids, sents):
                if stext and stext != mrs_sent.text:
                    print("Invalid sentence text: sentID: {} | {} <> {}".format(sid, stext, mrs_sent.text))
                    exit()
            print("Sentences are verified, proceeding to patch sent idents")
            for (sid, stext), mrs_sent in zip(sent_ids, sents):
                mrs_sent.ident = sid
                if args.both:
                    mrs_sent.ID = sid
        else:
            patch_gold_sid(sents)
        if args.output:
            print("Sentence idents are patched, writing output XML to: {} ...".format(args.output))
            chio.write_file(args.output, sents.to_xml_str())
        else:
            print(sents.to_xml_str())
        print("Done")
    else:
        print("No document to patch")
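# The ID file consumed above is a TSV of "<ident><TAB><sentence text>", one
# row per gold sentence, in the same order as the MRS document. A hypothetical
# example; an empty text column is falsy, so that row skips text verification:
#
#   1010\tAbrams barked.
#   1020\tThe dog chased the cat.
#   1030\t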
def test_file_rw(self):
    tmpfile = os.path.join(TEST_DATA, 'test.txt')
    tmpgzfile = os.path.join(TEST_DATA, 'test.txt.gz')
    txt = 'ユニコード大丈夫だよ。'
    txtz = 'This is a zipped text file.'
    chio.write_file(content=txt, mode='wb', path=tmpfile)  # write content as bytes
    chio.write_file(tmpgzfile, content=txtz)
    # ensure that tmpgzfile is actually a gzip file
    with gzip.open(tmpgzfile, mode='rt') as infile:
        self.assertEqual(infile.read(), txtz)
    # verify written content
    self.assertTrue(chio.is_file(tmpfile))
    self.assertTrue(chio.is_file(tmpgzfile))
    self.assertEqual(chio.read_file(tmpfile), txt)
    self.assertEqual(chio.read_file(tmpgzfile), txtz)
    self.assertEqual(chio.read_file(tmpfile, mode='r'), txt)
    self.assertEqual(chio.read_file(tmpgzfile, mode='r'), txtz)
    self.assertIsInstance(chio.read_file(tmpfile, mode='rb'), bytes)
    self.assertIsInstance(chio.read_file(tmpgzfile, mode='rb'), bytes)
def ukb_to_ttl(cli, args):
    ''' Convert UKB output to TTL '''
    doc = read_ttl(args.ttl, ttl_format=args.ttl_format)
    print("Source TTL file: {} | Sentences: {}".format(args.ttl, len(doc)))
    token_map = {}
    if args.tokens:
        # a token file is provided
        tokens = [[int(x) for x in line] for line in chio.read_tsv(args.tokens)]
        for sid, wid, cfrom, cto in tokens:
            token_map[(sid, wid)] = (cfrom, cto)
        print("Found tokens: {}".format(len(token_map)))
    c = Counter()
    sids = set()
    input_sids = {int(s.ID) for s in doc}
    for line_idx, line in enumerate(chio.read_file(args.input).splitlines()):
        if line.startswith('!! '):
            continue
        parts = line.split()
        if len(parts) != 5:
            print("WARNING: Invalid line -> {}: {}".format(line_idx, line))
            continue
        sid_text, wid_text, synsetid, unknown, lemma = parts
        sid = int(sid_text)
        wid = int(wid_text[1:])
        sent_obj = doc.get(sid, default=None)
        if sent_obj is None:
            print("SID #{} could not be found".format(sid))
        elif not token_map and wid >= len(sent_obj):
            print("Invalid wid: line#{} - sent#{} - wid#{}".format(line_idx, sid, wid))
        else:
            # now can tag ...
            # remove current concepts if needed
            # if args.removetags:
            #     cids = list(c.cidx for c in sent_obj.concepts)
            #     for cid in cids:
            #         sent_obj.pop_concept(cid)
            if not token_map:
                token = sent_obj[wid]
                # double check token text
                if lemma != token.text.lower() and lemma != token.lemma.lower():
                    print("Invalid token text: {} <> {}/{}".format(lemma, token.text.lower(), token.lemma.lower()))
                sent_obj.new_concept(synsetid, lemma, tokens=[wid])
            else:
                # create a sentence-level tag instead
                cfrom, cto = token_map[(sid, wid)]
                sent_obj.new_tag(synsetid, cfrom, cto, tagtype='WN')
            c.count("Tokens")
            sids.add(sid)
    print("UKB sentences: {}".format(len(sids)))
    print("Not found: {}".format(input_sids.difference(sids)))
    c.summarise()
    # remove existing tags if needed
    if args.removetags:
        for sent_obj in doc:
            sent_obj.tags.clear()
        print("Sent #1 tags: {}".format(len(doc[0].tags)))
    # baking
    if not args.tokens:
        print("Now baking to tags ...")
        bake_doc(doc)
    else:
        print("WARNING: Because a token file was provided, no auto-baking will be done")
    print("Sent #1 tags after baking: {}".format(len(doc[0].tags)))
    # Now output ...
    if args.output:
        print("Output to file ...")
        _writer = get_ttl_writer(args.output, ttl_format=args.ttl_format, id_seed=args.seed)
        for sent in doc:
            _writer.write_sent(sent)
        print("Written {} sentences to {}".format(len(doc), args.output))
    print("Done")
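# For reference, a sketch of the two inputs ukb_to_ttl expects (illustrative
# values, not real data). UKB result lines are whitespace-separated
# "<sent-id> w<word-id> <synset-id> <score?> <lemma>" (the fourth column is
# unused above, hence the name `unknown`); lines starting with '!! ' are
# comments and are skipped. The optional token TSV maps (sid, wid) to
# character offsets (cfrom, cto) in the raw sentence.
#
# ukb output (args.input):
#   !! created by ukb
#   10001 w2 02084071-n 0.71 dog
#   10001 w4 01926311-v 0.64 run
#
# token file (args.tokens), tab-separated columns sid, wid, cfrom, cto:
#   10001	2	4	7
#   10001	4	8	12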