def make_derivations(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    esrlpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    if not os.path.exists(esrlpath):
        os.makedirs(esrlpath)

    progress = 0
    svc = grpc.CcgParserService(daemon)
    stub = svc.open_client()
    failed_total = 0
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'RAW')
    dirlist = os.listdir(ldcpath)
    try:
        for fname in dirlist:
            ldcpath1 = os.path.join(ldcpath, fname)
            m = idsrch.match(os.path.basename(ldcpath1))
            if m is None:
                continue
            with open(ldcpath1, 'r') as fd:
                lines = fd.readlines()
            derivations = []
            failed_parse = []
            for ln in lines:
                # Parse with EasySRL via gRPC
                try:
                    ccg = grpc.ccg_parse(stub, ln)
                    derivations.append(safe_utf8_encode(ccg.replace('\n', '')))
                except Exception:
                    failed_parse.append(safe_utf8_encode(ln.strip()))
                    # Add a comment line so derivation line numbers stay aligned with ids
                    derivations.append(safe_utf8_encode('# FAILED: ' + ln.strip()))
                progress = print_progress(progress, 10)
            id = m.group('id')
            if len(derivations) != 0:
                with open(os.path.join(esrlpath, 'ccg_derivation%s.txt' % id), 'w') as fd:
                    fd.write(b'\n'.join(derivations))
            failed_total += len(failed_parse)
            if len(failed_parse) != 0:
                with open(os.path.join(esrlpath, 'ccg_failed%s.txt' % id), 'w') as fd:
                    fd.write(b'\n'.join(failed_parse))
    finally:
        print_progress(progress, 10, done=True)
        svc.shutdown()
    if failed_total != 0:
        print('THERE WERE %d PARSE FAILURES' % failed_total)
def save(self, stream):
    """Save the dictionary to a binary stream.

    Format: a 'max_edit_distance:longest_word_length' header line, then one
    'word:count:deletion1:deletion2:...' entry per line.
    """
    self.modify_lock.acquire()
    try:
        stream.write(b'%d:%d\n' % (self.max_edit_distance, self.longest_word_length))
        for k, v in self.dictionary.iteritems():
            stream.write(safe_utf8_encode(k))
            stream.write(b':')
            stream.write(safe_utf8_encode(str(v[1])))
            stream.write(b':')
            stream.write(safe_utf8_encode(':'.join(v[0])))
            stream.write(b'\n')
    finally:
        self.modify_lock.release()
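# A minimal load() counterpart, sketched purely from the format save() writes
# above. This is an assumption about the on-disk format, not the project's
# real loader; names and error handling are illustrative.
def load_sketch(self, stream):
    header = stream.readline().rstrip(b'\n').split(b':')
    self.max_edit_distance = int(header[0])
    self.longest_word_length = int(header[1])
    for line in stream:
        parts = line.rstrip(b'\n').split(b':')
        if len(parts) < 2:
            continue
        # Entries mirror save(): dictionary[word] = (suggestions, count)
        suggestions = [p for p in parts[2:] if len(p) != 0]
        self.dictionary[parts[0]] = (suggestions, int(parts[1]))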
def __init__(self, daemon, workdir=None, jarfile=None, extra_args=None, debug=False):
    """Create a CCG Parse Service.

    Args:
        daemon: 'easysrl' or 'neuralccg'.
        workdir: Optional path to the daemon when running in release mode.
        jarfile: Optional path to the daemon jar file.
        extra_args: Optional list of extra command-line arguments.
        debug: If True, run the daemon with console logging.
    """
    global _logger, _GRPC_RUNNING
    self.workdir = safe_utf8_encode(workdir) if workdir else os.getcwd()
    self.grpc_stop_onclose = False
    self.daemon_name = safe_utf8_encode(daemon)
    self.child = None
    extra_args = None if extra_args is None else [safe_utf8_encode(a) for a in extra_args]
    try:
        # Check if the easyxxx service has already started. If not, start it.
        self.grpc_stub, _ = get_client_transport('localhost', self.daemon_port)
        ccg_parse(self.grpc_stub, '')
    except Exception:
        # Not started
        _logger.info('Starting %s gRPC daemon', self.daemon_name)
        if USE_DEVEL_PATH and jarfile is None:
            cmdline = [os.path.join(PROJDIR, 'scripts', 'start_server.sh'), daemon]
            if extra_args is not None:
                cmdline.extend(extra_args)
            subprocess.call(cmdline)
            time.sleep(self._WAIT_TIME)  # Give it some time to lock session access
        elif jarfile is not None:
            log_file = os.path.join(self.workdir, self.daemon_name + '.log')
            if debug:
                cmdline = ['/usr/bin/java', '-Dlog4j.debug', '-jar', jarfile, '--daemonize']
            else:
                cmdline = ['/usr/bin/java', '-jar', jarfile, '--daemonize']
            if extra_args is not None:
                cmdline.extend(extra_args)
            _logger.debug(cmdline)
            if debug:
                self.child = subprocess.Popen(cmdline)
            else:
                self.child = subprocess.Popen(cmdline,
                                              stdout=open('/dev/null', 'w'),
                                              stderr=open('/dev/null', 'w'))
            time.sleep(self._WAIT_TIME)
            os.kill(self.child.pid, 0)  # Raises OSError if the child has died
            self.grpc_stop_onclose = True
            _logger.info('started child daemon with pid %d', self.child.pid)
        else:
            raise ValueError('CcgParserService.__init__(): jarfile is required when not using the devel path')
        _GRPC_RUNNING.add(self)
        self.stub, _ = get_client_transport('localhost', self.daemon_port)
        # Call asynchronously - will wait until the default session is created
        ccg_parse(self.stub, '', timeout=120)
        self.grpc_stop_onclose = True
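# Lifecycle sketch (hedged): open_client() and shutdown() follow their usage in
# make_derivations() elsewhere in this codebase; the sentence is illustrative.
#
#   svc = CcgParserService('easysrl')
#   stub = svc.open_client()
#   try:
#       derivation = ccg_parse(stub, 'The quick brown fox jumps.')
#   finally:
#       svc.shutdown()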
def ccg_parse(client, sentence, session_id=DEFAULT_SESSION, timeout=0):
    """Parse the sentence using the specified session.

    Args:
        client: The client end-point stub returned from get_client_transport().
        sentence: The sentence. Can be unicode, utf-8, or ascii.
        session_id: Optional session id.
        timeout: If non-zero, make the call asynchronously with a timeout equal
            to this value. Typically not needed unless the call may time out
            when run synchronously.

    Returns:
        The response message string.
    """
    isUnicode = isinstance(sentence, unicode)
    if isUnicode:
        # The CCG parser is Java so input must be utf-8 or ascii
        sentence = sentence.encode('utf-8')
    query_input = create_query_input('text', sentence)
    request = Request()
    request.LUCID = session_id
    request.spec.name = 'infer'
    request.spec.content.extend([query_input])
    if timeout <= 0:
        response = client.infer(request)
    else:
        infer_future = client.infer.future(request, timeout)
        # FIXME: Need to add error reporting to the Response structure.
        response = infer_future.result()
    if future_string == unicode:
        isUnicode = True
    if isinstance(response.msg, unicode):
        return response.msg if isUnicode else safe_utf8_encode(response.msg)
    return response.msg if not isUnicode else safe_utf8_decode(response.msg)
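# Hedged usage: a plain synchronous parse, and an asynchronous call with a
# timeout as CcgParserService.__init__ above does while waiting for the
# default session. 'stub' comes from get_client_transport(); the sentence is
# illustrative.
#
#   derivation = ccg_parse(stub, 'The cat sat on the mat.')
#   ccg_parse(stub, '', timeout=120)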
def build_from_model(fn_dict, outdir, modelPath, verbose=False, verify=True):
    print('Building function templates from model folder...')
    fname = os.path.join(modelPath, 'markedup')
    if not os.path.exists(fname) or not os.path.isfile(fname):
        print('Error: %s does not exist or is not a file' % fname)
        return fn_dict
    with open(fname, 'r') as fd:
        signatures = fd.readlines()

    failed_rules = []
    progress = 0
    for sig in signatures:
        predarg = Category(sig.strip())
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue
            if verify:
                f = template.create_empty_functor()
                U1 = f.get_unify_scopes(False)
                U2 = f.category.extract_unify_atoms(False)
                assert len(U1) == len(U2)
                C1 = f.category
                C2 = template.predarg_category.clean(True)
                assert C1.can_unify(C2)
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = str(f1)
                t2 = str(template)
                assert t1 == t2, 'verify failed\n  t1=%s\n  t2=%s\n  f1=%s\n  f2=%s' \
                    % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
            # DEBUG: flip to True to re-run the failing call under a debugger
            if False:
                try:
                    FunctorTemplate.create_from_category(predarg)
                except Exception:
                    pass

    print_progress(progress, done=True)
    if len(failed_rules) != 0:
        print('Warning: model - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_easysrl_templates_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)
    return fn_dict
def make_s3_name(cls, text):
    global _NALNUMSP
    text = text.lower()
    if future_string == unicode:
        return '-'.join(filter(lambda y: len(y) != 0, _NALNUMSP.sub('', text).split(' ')))
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    result = '-'.join(filter(lambda y: len(y) != 0, _NALNUMSP.sub('', text).split(' ')))
    # PWG: don't know why this happens, but if text contains unicode
    # it is converted automatically
    return safe_utf8_encode(result)
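# Illustrative example (assumes _NALNUMSP matches anything that is neither
# alphanumeric nor a space, so punctuation is dropped before hyphenation;
# the class name is hypothetical):
#
#   NewsSource.make_s3_name('Stocks Rally; Dow Up 2%!')  ->  'stocks-rally-dow-up-2'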
def strip_apostrophe_s(word):
    """Strip the trailing 's from nouns.

    Args:
        word: An ascii or utf-8 string.

    Returns:
        The stripped word.
    """
    # Must support utf-8
    if len(word) > 2:
        if word.endswith("'s"):
            return word[0:-2]
        elif isinstance(word, unicode):
            if word.endswith(u"’s"):
                return word.replace(u"’s", u'')
        else:
            uword = safe_utf8_decode(word)
            if uword.endswith(u"’s"):
                return safe_utf8_encode(uword.replace(u"’s", u''))
    return word
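# Quick doctest-style examples (illustrative values, not from the source):
#
#   strip_apostrophe_s("John's")    -> "John"
#   strip_apostrophe_s(u"John’s")   -> u"John"   (curly apostrophe, unicode)
#   strip_apostrophe_s("its")       -> "its"     (no apostrophe, unchanged)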
def build_from_ldc_ccgbank(fn_dict, outdir, verbose=False, verify=True):
    print('Building function templates from LDC ccgbank...')
    # Gather all derivation files under AUTO
    allfiles = []
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')
    dirlist1 = os.listdir(ldcpath)
    for dir1 in dirlist1:
        ldcpath1 = os.path.join(ldcpath, dir1)
        if os.path.isdir(ldcpath1):
            dirlist2 = os.listdir(ldcpath1)
            for dir2 in dirlist2:
                ldcpath2 = os.path.join(ldcpath1, dir2)
                if os.path.isfile(ldcpath2):
                    allfiles.append(ldcpath2)

    failed_parse = []
    failed_rules = []
    rules = []
    progress = 0
    for fn in allfiles:
        progress = print_progress(progress, 10)
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        # Files alternate header line, derivation line
        for hdr, ccgbank in zip(lines[0::2], lines[1::2]):
            pt = None
            try:
                pt = parse_ccg_derivation(ccgbank)
                extract_predarg_categories_from_pt(pt, rules)
            except Exception as e:
                failed_parse.append(safe_utf8_encode('CCGBANK: ' + ccgbank.strip()))
                failed_parse.append(safe_utf8_encode('Error: %s' % e))
            # Now attempt to track undefined unary rules
            if pt is not None:
                try:
                    builder = Ccg2Drs()
                    builder.build_execution_sequence(pt)
                    # Calling this will track undefined unary rules
                    builder.get_predarg_ccgbank()
                except Exception:
                    pass

    # Rescale the progress counter for the finer-grained loop below
    progress = (progress / 10) * 1000
    for predarg in rules:
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = future_string(f1)
                t2 = future_string(template)
                assert t1 == t2, 'verify failed\n  t1=%s\n  t2=%s\n  f1=%s\n  f2=%s' \
                    % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
            # DEBUG: flip to True to re-run the failing call under a debugger
            if False:
                try:
                    FunctorTemplate.create_from_category(predarg)
                except Exception:
                    pass

    print_progress(progress, done=True)
    if len(failed_parse) != 0:
        print('Warning: ldc - %d parses failed' % (len(failed_parse) / 2))
        with open(os.path.join(outdir, 'parse_ccg_derivation_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_parse))
        if verbose:
            # failed_parse holds flat derivation/error line pairs
            for m in failed_parse:
                print(m)
    if len(failed_rules) != 0:
        print('Warning: ldc - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_ldc_templates_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)
    return fn_dict
def __str__(self):
    return safe_utf8_encode(self._get_str())
    if orphaned:
        sys.stdout.write('<orphaned>\n')
        sys.stdout.write(orphaned)
        sys.stdout.write('\n</orphaned>\n')
    if conjoins:
        sys.stdout.write('<conjoins>\n')
        sys.stdout.write(conjoins)
        sys.stdout.write('\n</conjoins>\n')
    if functor_phrases:
        sys.stdout.write('<functor_phrases>\n')
        sys.stdout.write(functor_phrases)
        sys.stdout.write('\n</functor_phrases>\n')
else:
    # Write the sections to outfile rather than stdout
    with open(outfile, 'w') as fd:
        if html:
            fd.write(safe_utf8_encode(html))
            fd.write(b'\n')
        if ccg:
            fd.write(b'<ccg>\n')
            fd.write(safe_utf8_encode(ccg.strip()))
            fd.write(b'\n</ccg>\n')
        if pccg:
            fd.write(b'<predarg>\n')
            fd.write(safe_utf8_encode(pccg))
            fd.write(b'\n</predarg>\n')
        if drs:
            fd.write(b'<drs>\n')
            fd.write(drs)
            fd.write(b'\n</drs>\n')
        if fol:
            fd.write(b'<fol>\n')
ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')
outpath = os.path.join(projdir, 'data', 'ldc', 'mapping')
if not os.path.exists(outpath):
    os.makedirs(outpath)
dirlist1 = os.listdir(ldcpath)
for dir1 in dirlist1:
    ldcpath1 = os.path.join(ldcpath, dir1)
    if os.path.isdir(ldcpath1):
        dirlist2 = os.listdir(ldcpath1)
        mapping = []
        for dir2 in dirlist2:
            ldcpath2 = os.path.join(ldcpath1, dir2)
            wsjnm, _ = os.path.splitext(dir2)
            if os.path.isfile(ldcpath2):
                id = 1
                missing = False
                with open(ldcpath2, 'r') as fd:
                    lines = fd.readlines()
                for hdr, ccgbank in zip(lines[0::2], lines[1::2]):
                    # Header starts 'ID=wsj_SSNN.M ...'; strip the 'ID=' prefix
                    hdrid = hdr.split(' ')[0][3:].strip()
                    expected_hdrid = '%s.%d' % (wsjnm, id)
                    if not missing and hdrid != expected_hdrid:
                        missing = True
                        print('missing entry, expected %s, actual %s' % (expected_hdrid, hdrid))
                    mapping.append(safe_utf8_encode(hdrid))
                    id += 1
        with open(os.path.join(outpath, 'ccg_map%s.txt' % dir1), 'w') as fd:
            fd.write(b'\n'.join(mapping))
def make_lexicon(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                failed_parse += 1
                continue
            uid = '%s-%04d' % (idx, i)
            try:
                # dictionary[0-25][stem][set([c]), set(uid)]
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                print(e)
                continue

    rtdict = {}
    for idx in range(len(dictionary)):
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
        # Free up memory
        dictionary[idx] = None
        d = None

    for rt, cdict in rtdict.iteritems():
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' % safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')
def __repr__(self):
    if self.drs:
        return b'<Lexeme>:(%s, %s, %s)' % (safe_utf8_encode(self.word), self.drs, self.category)
    return b'<Lexeme>:(%s, %s, %s)' % (safe_utf8_encode(self.word), self.stem, self.category)
def __str__(self):
    return safe_utf8_encode(self.to_string())
def get_aws_s3_names(self, article_text):
    """Get the s3 names for the article.

    Returns:
        A tuple containing the s3 bucket and object-name for the article.
    """
    # FIXME: move to __future__
    global _DOM, _ALPHANUM
    m = _DOM.match(self.entry.link)
    assert m is not None
    if future_string == unicode:
        dom = m.group('domain').replace('.', '-')
        name = self.make_s3_name(self.entry.title)
        dt = self.get_date()
        dtYM = '{:%Y-%m}'.format(dt)
        dtD = '{:%d}'.format(dt)[::-1]
        h = hashlib.md5()
        language = self.feed.language.lower() if hasattr(self.feed, 'language') else 'en-us'
        h.update(safe_utf8_encode(language))
        h.update(safe_utf8_encode(dom))
        h.update(safe_utf8_encode(name))
        h.update(safe_utf8_encode(article_text))
        h = h.hexdigest()
        feedtitle = self.make_s3_name(self.feed.title) if hasattr(self.feed, 'title') else 'unknown'
        return 'marbles-ai-feeds-%s-%s' % (language, dtYM), \
            '%s/%s/%s/%s/%s' % (dtD, dom, feedtitle, name, h)

    dom = safe_utf8_encode(m.group('domain').replace('.', '-'))
    name = self.make_s3_name(self.entry.title)
    dt = self.get_date()
    dtYM = safe_utf8_encode('{:%Y-%m}'.format(dt))
    dtD = safe_utf8_encode('{:%d}'.format(dt)[::-1])
    h = hashlib.md5()
    article_text = safe_utf8_encode(article_text)
    name = safe_utf8_encode(name)
    # FIXME: use geo-location on domain to infer language
    language = safe_utf8_encode(self.feed.language.lower()) if hasattr(self.feed, 'language') else 'en-us'
    h.update(language)
    h.update(dom)
    h.update(name)
    h.update(article_text)
    h = h.hexdigest()
    feedtitle = self.make_s3_name(self.feed.title) if hasattr(self.feed, 'title') else 'unknown'
    return 'marbles-ai-feeds-%s-%s' % (language, dtYM), \
        '%s/%s/%s/%s/%s' % (dtD, dom, feedtitle, name, h)
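# Illustrative result (all values hypothetical): an 'en-us' article dated
# 2017-06-09 from www.example.com maps to something like
#   bucket: 'marbles-ai-feeds-en-us-2017-06'
#   object: '90/www-example-com/<feed-title>/<article-title>/<md5-hexdigest>'
# The day-of-month is reversed ('09' -> '90'), presumably to spread object
# keys across S3 prefixes.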
def make_drs(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))
    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'drs')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = os.listdir(ldcpath)
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg2drs = []
    start = 0
    progress = -1
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')
        if not os.path.exists(os.path.join(easysrl_path, idx)):
            os.mkdir(os.path.join(easysrl_path, idx))
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue
            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)
            try:
                # CCG parser is Java so output is UTF-8.
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
                pccg = pt_to_ccg_derivation(pt)
            except Exception:
                failed_parse += 1
                continue
            try:
                d = process_ccg_pt(pt, CO_VERIFY_SIGNATURES | CO_NO_VERBNET | CO_NO_WIKI_SEARCH).get_drs()
                assert d is not None
                assert isinstance(d, DRS)
                d = d.show(SHOW_LINEAR).strip()
            except Exception as e:
                print(e)
                failed_ccg2drs.append((name, i, ccgbank))
                continue
            with open(os.path.join(easysrl_path, idx, 'drs_%s_%04d.dat' % (idx, i)), 'w') as fd:
                fd.write(b'<sentence>\n')
                fd.write(safe_utf8_encode(s))
                fd.write(b'\n</sentence>\n<drs>\n')
                fd.write(safe_utf8_encode(d))
                fd.write(b'\n</drs>\n<predarg>\n')
                fd.write(safe_utf8_encode(pccg))
                fd.write(b'\n')
                fd.write(b'</predarg>\n')

    if failed_parse != 0:
        print('%d derivations failed to parse' % failed_parse)
    if len(failed_ccg2drs) != 0:
        print('%d derivations failed to convert to DRS' % len(failed_ccg2drs))
        for x in failed_ccg2drs:
            print('%s-%04d failed: {%s}' % x)
# Use CCGBANK as our corpus
for fname in dirlist:
    print(fname)
    ldcpath1 = os.path.join(ldcpath, fname)
    m = idsrch.match(os.path.basename(ldcpath1))
    if m is None:
        continue
    with open(ldcpath1, 'r') as fp:
        stats = spellchecker.build_from_corpus(fp, stats)

# Iterate wordnet
strm = StringIO.StringIO()
for ss in wn.all_synsets():
    ln = ' '.join(ss.lemma_names())
    strm.write(safe_utf8_encode(ln))
    strm.write(b'\n')
strm.seek(0)
stats = spellchecker.build_from_corpus(strm, stats)

print("total words processed: %i" % stats[0])
print("total unique words in corpus: %i" % stats[1])
print("total items in dictionary (corpus words and deletions): %i" % len(spellchecker.dictionary))
print("  edit distance for deletions: %i" % spellchecker.max_edit_distance)
print("  length of longest word in corpus: %i" % spellchecker.longest_word_length)

with open(os.path.join(pypath, 'marbles', 'ie', 'kb', 'data', 'dictionary-en.dat'), 'w') as fp:
    spellchecker.save(fp)