def split(self, p_test=5, limit=0, min_quality=2, add_all=False):

    # split transcripts into train/test sets, keeping roughly p_test percent
    # for testing; entries below min_quality are skipped unless add_all is set

    ts_all   = {}
    ts_train = {}
    ts_test  = {}

    cnt = 0
    for cfn in self.ts:
        v = self.ts[cfn]

        cnt += 1
        if limit > 0 and cnt > limit:
            break

        if v['quality'] < min_quality:
            if (v['quality'] != 0) or (not add_all):
                continue

        if len(v['ts']) == 0:
            if add_all:
                v['ts'] = ' '.join(tokenize(v['prompt']))
            else:
                print "WARNING: %s transcript missing" % cfn
                continue

        ts_all[cfn] = v

        if len(ts_test) < (len(ts_all) * p_test / 100):
            ts_test[cfn] = v
        else:
            ts_train[cfn] = v

    return ts_all, ts_train, ts_test
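# Hedged usage sketch: assumes self.ts maps clip filenames to dicts with
# 'quality', 'ts' (transcript) and 'prompt' keys, as the loop above implies;
# the `corpus` name below is illustrative, not part of the original API.
#
#   ts_all, ts_train, ts_test = corpus.split(p_test=10, min_quality=2)
#   print "%d usable transcripts: %d train / %d test" % \
#         (len(ts_all), len(ts_train), len(ts_test))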
def compute_x(self, txt):

    tokens = tokenize(txt)

    return map(
        lambda token: self.input_dict[token] if token in self.input_dict else UNK_ID,
        tokens)
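# Hedged usage sketch: assumes input_dict and UNK_ID were set up by
# compute_dicts() below; the ids shown are illustrative.
#
#   x = data.compute_x(u'hello world')
#   # -> e.g. [17, UNK_ID] if 'hello' is in input_dict and 'world' is not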
def compute_input_dict(self):

    logging.info('Computing input dict...')

    self.dictionary   = {'': 0}  # id 0 doubles as padding / unknown
    self.max_len      = 0
    self.num_segments = 0

    for segments in (self.seg_test, self.seg_train):
        for segment in segments:
            tokens = tokenize(segment[0])
            # print segment.txt, '->', repr(tokens)

            l = len(tokens)
            if l > self.max_len:
                self.max_len = l

            for token in tokens:
                if not token in self.dictionary:
                    self.dictionary[token] = len(self.dictionary)

            self.num_segments += 1

    logging.info(
        'input dict done. %d entries, max segment len is %d tokens.' %
        (len(self.dictionary), self.max_len))

    return self.dictionary, self.max_len, self.num_segments
def compute_dicts(self):

    # build input and output dicts

    self.input_dict  = {_PAD: PAD_ID, _GO: GO_ID, _EOS: EOS_ID, _UNK: UNK_ID}
    self.output_dict = {_PAD: PAD_ID, _GO: GO_ID, _EOS: EOS_ID, _UNK: UNK_ID}

    self.input_max_len  = 0
    self.output_max_len = 0
    self.num_segments   = 0

    for dr in self.session.query(model.DiscourseRound).all():

        # input

        tokens = tokenize(dr.inp)

        l = len(tokens)
        if l > self.input_max_len:
            self.input_max_len = l

        for token in tokens:
            if not token in self.input_dict:
                self.input_dict[token] = len(self.input_dict)

        # output

        preds = dr.resp.split(';')

        l = len(preds) + 1  # +1 to account for _EOS token
        if l > self.output_max_len:
            self.output_max_len = l

        for pred in preds:
            if not pred in self.output_dict:
                self.output_dict[pred] = len(self.output_dict)

        self.num_segments += 1

    logging.info(
        'dicts done. input: %d entries, input_max_len=%d. output: %d entries, output_max_len=%d. num_segments: %d'
        % (len(self.input_dict), self.input_max_len,
           len(self.output_dict), self.output_max_len, self.num_segments))
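# Hedged usage sketch: once compute_dicts() has run, inputs and responses can
# be mapped to fixed id sequences for seq2seq training (the `data` name is an
# assumption):
#
#   data.compute_dicts()
#   print data.input_dict[_UNK]    # -> UNK_ID
#   print data.output_max_len      # longest response in predicates, +1 for _EOS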
def apply_punkt(text):

    global tokenizer, outf

    sentncs = tokenizer.tokenize(text, realign_boundaries=True)
    for sentence in sentncs:
        print "Sentence: %s" % sentence
        outf.write(u'%s\n' % ' '.join(tokenize(sentence)))
def compute_input_hist(self):

    hist = {}  # number of tokens -> number of discourse rounds

    for dr in self.session.query(model.DiscourseRound).all():
        tokens = tokenize(dr.inp)

        l = len(tokens)
        if not l in hist:
            hist[l] = 0
        hist[l] += 1

    return hist
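# Hedged usage sketch: print the histogram in ascending token-length order
# (the `data` name is illustrative):
#
#   hist = data.compute_input_hist()
#   for l in sorted(hist):
#       print "%3d tokens: %5d rounds" % (l, hist[l])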
def compute_x(self, txt):

    x = np.zeros(self.max_len, np.int32)

    tokens = tokenize(txt)

    # left-pad: token ids fill the last len(tokens) slots of the fixed-width
    # vector; unknown tokens map to id 0

    l = len(tokens)
    for i, token in enumerate(tokens):
        x[self.max_len - l + i] = self.dictionary[token] if token in self.dictionary else 0

    return x
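# Hedged usage sketch: unlike the map()-based compute_x() above, this variant
# returns a fixed-width, left-padded vector (values are illustrative):
#
#   x = data.compute_x(u'hello world')
#   # -> array([0, 0, ..., 12, 7], dtype=int32), padded to self.max_len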
def nlp_gen(src, clause):

    global nlp_macros, session

    args = clause.head.args

    lang = args[0].name

    # extract all macros used

    macro_names = set()

    argc = 1
    while argc < len(args):
        nlp   = args[argc  ].s
        preds = args[argc+1].s
        argc += 2

        for pos, char in enumerate(nlp):
            if char == '@':
                macro = re.match(r'@([A-Z]+):', nlp[pos:])
                # print "MACRO:", macro.group(1)
                macro_names.add(macro.group(1))

    # generate all macro-expansions (breadth-first over macro value combinations)

    macro_names = sorted(macro_names)
    todo = [ (0, {}) ]

    while True:
        if len(todo) == 0:
            break

        idx, mappings = todo.pop(0)

        if idx < len(macro_names):
            macro_name = macro_names[idx]
            for v in nlp_macros[macro_name]:
                nm = copy(mappings)
                # nm[macro_name] = (v, nlp_macros[macro_name][v])
                nm[macro_name] = v
                todo.append ( (idx+1, nm) )
        else:
            # generate discourse for this set of mappings
            # print repr(mappings)

            # create discourse in db
            discourse = model.Discourse(num_participants = 2, lang = lang, src = src)
            session.add(discourse)

            argc = 1
            round_num = 0
            while argc < len(args):
                s = args[argc  ].s
                p = args[argc+1].s
                argc += 2

                for k in mappings:
                    for v in mappings[k]:
                        s = s.replace('@'+k+':'+v, mappings[k][v])
                        p = p.replace('@'+k+':'+v, mappings[k][v])

                inp_raw = utils.compress_ws(s.lstrip().rstrip())
                p       = utils.compress_ws(p.lstrip().rstrip())

                # print s
                # print p

                # tokenize strings, wrap them into say() calls

                inp_tokenized = ' '.join(tokenize(inp_raw, lang))

                preds = p.split(';')

                np = ''
                for pr in preds:
                    if not pr.startswith('"'):
                        if len(np) > 0:
                            np += ';'
                        np += pr.strip()
                        continue

                    for word in tokenize(pr, lang):
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + word + '")'

                # preserve trailing sentence punctuation (char before closing quote)
                if len(p) > 2:
                    if p[len(p)-2] in ['.', '?', '!']:
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + p[len(p)-2] + '")'

                np += ';eou'

                dr = model.DiscourseRound(inp_raw       = inp_raw,
                                          inp_tokenized = inp_tokenized,
                                          response      = np,
                                          discourse     = discourse,
                                          round_num     = round_num)
                session.add(dr)

                round_num += 1
def nlp_test(clause):

    global nlp_test_engine, db

    args = clause.head.args

    lang = args[0].name

    # extract test rounds, look up matching discourses

    rounds        = []  # [ (in, out, actions), ...]
    round_num     = 0
    discourse_ids = set()

    for ivr in args[1:]:
        if ivr.name != 'ivr':
            raise PrologError ('nlp_test: ivr predicate args expected.')

        test_in      = ''
        test_out     = ''
        test_actions = set()

        for e in ivr.args:
            if e.name == 'in':
                test_in = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'out':
                test_out = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'action':
                test_actions.add(unicode(e))
            else:
                raise PrologError (u'nlp_test: ivr predicate: unexpected arg: ' + unicode(e))

        rounds.append((test_in, test_out, test_actions))

        # look up matching discourse_ids:

        d_ids = set()
        for dr in session.query(model.DiscourseRound).filter(model.DiscourseRound.inp_tokenized==test_in) \
                         .filter(model.DiscourseRound.round_num==round_num).all():
            d_ids.add(dr.discourse_id)

        if round_num == 0:
            discourse_ids = d_ids
        else:
            discourse_ids = discourse_ids & d_ids

        print 'discourse_ids:', repr(discourse_ids)

        round_num += 1

    if len(discourse_ids) == 0:
        raise PrologError ('nlp_test: no matching discourse found.')

    # run the test(s): look up reaction to input in db, execute it, check result

    for did in discourse_ids:

        nlp_test_engine.reset_context()

        round_num = 0
        for dr in session.query(model.DiscourseRound).filter(model.DiscourseRound.discourse_id==did) \
                         .order_by(model.DiscourseRound.round_num):

            prolog_s = ','.join(dr.response.split(';'))

            print
            print "Round:", round_num, dr.inp_tokenized, '=>', prolog_s

            c = parser.parse_line_clause_body(prolog_s)
            # logging.debug( "Parse result: %s" % c)

            # logging.debug( "Searching for c: %s" % c )

            nlp_test_engine.reset_utterances()
            solutions = nlp_test_engine.search(c)

            if len(solutions) == 0:
                raise PrologError ('nlp_test: no solution found.')

            print "round %d utterances: %s" % (round_num, repr(nlp_test_engine.get_utterances()))

            # check actual utterances vs expected one

            test_in, test_out, test_actions = rounds[round_num]

            found = False
            for utt in nlp_test_engine.get_utterances():
                actual_out = ' '.join(tokenize(utt['utterance'], utt['lang']))
                if actual_out == test_out:
                    found = True
                    break

            if found:
                print "***MATCHED!"
            else:
                raise PrologError ('nlp_test: actual utterance did not match.')

            # FIXME: check actions

            round_num += 1
print "looking for missing words..." missing = {} # word -> count num = len(transcripts) cnt = 0 for cfn in transcripts: ts = transcripts[cfn] cnt += 1 if ts['quality'] > 0: continue for word in tokenize(ts['prompt']): if word in lex: continue if word in missing: missing[word] += 1 else: missing[word] = 1 cnt = 0 for item in reversed(sorted(missing.items(), key=lambda x: x[1])): lex_base = item[0] ipas = sequitur_gen_ipa(lex_base)
def paint_main(stdscr, cur_ts):

    global edit_ts, prompt_tokens, prompt_token_idx

    ts = edit_ts[cur_ts]

    stdscr.clear()

    my, mx = stdscr.getmaxyx()

    for x in range(mx):
        stdscr.insstr(0, x, ' ', curses.A_REVERSE)
        stdscr.insstr(my - 2, x, ' ', curses.A_REVERSE)
        stdscr.insstr(my - 1, x, ' ', curses.A_REVERSE)

    # header

    s = u"%2d/%2d %-30s QTY: %d" % (cur_ts + 1, len(edit_ts), ts['cfn'], ts['quality'])

    stdscr.insstr(0, 0, s.encode('utf8'), curses.A_BOLD | curses.A_REVERSE)
    stdscr.insstr(0, mx - 13, 'Speech Editor', curses.A_REVERSE)

    # prompts file

    if prompt_token_idx < len(prompt_tokens):
        pstr = ' '.join(prompt_tokens[prompt_token_idx:prompt_token_idx + 8])
        stdscr.insstr(1, mx - len(pstr), pstr.encode('utf8'))

    # body / transcript

    stdscr.insstr(2, 0, 'Prompt:', curses.A_BOLD)
    stdscr.insstr(3, 0, ts['prompt'].encode('utf8'))

    if len(ts['ts']) == 0:
        ts['ts'] = ' '.join(tokenize(ts['prompt']))

    cy = 5
    cx = 0
    missing_token = None
    for token in ts['ts'].split(' '):

        if token in lex:
            s = ''
            m = lex.get_multi(token)
            for t in m:
                v = m[t]
                if len(s) > 0:
                    s += ', '
                if len(m) > 1 and t == token:
                    s += '**'
                s += t
                s += ' [' + m[t]['ipa']
                s += ']'
            stdscr.insstr(cy, cx, s.encode('utf8'))
        else:
            if not missing_token:
                missing_token = token
            stdscr.insstr(cy, cx, token.encode('utf8'), curses.A_REVERSE)

        cy += 1
        if cy > my - 2:
            break

    # footer

    stdscr.insstr(my - 2, 0, " P:Play E:Prompt T:Transcript ", curses.A_REVERSE)
    stdscr.insstr(my - 1, 0, " L:LexEdit Prompts File: A=add S=skip B=Back ", curses.A_REVERSE)
    stdscr.insstr(my - 2, mx - 40, " Accept: 1=Poor 2=Fair 3=Good ", curses.A_REVERSE)
    stdscr.insstr(my - 1, mx - 40, " Q:Quit ", curses.A_REVERSE)

    stdscr.refresh()

    return missing_token
# print "loading lexicon..." lex = Lexicon() print "loading lexicon...done." # # load prompts # prompt_tokens = [] prompt_token_idx = 0 if options.promptsfn: with codecs.open(options.promptsfn, 'r', 'utf8') as promptsf: for line in promptsf: prompt_tokens.extend(tokenize(line)) print "%s read. %d tokens." % (options.promptsfn, len(prompt_tokens)) # # curses # locale.setlocale(locale.LC_ALL, "") stdscr = curses.initscr() curses.noecho() curses.cbreak() stdscr.keypad(1) #
#

all_tokens = set()

with codecs.open('%s/hal.txt' % WORKDIR, 'w', 'utf8') as allf:

    for tsfn in os.listdir('data/dst'):
        if not tsfn.endswith('.ts'):
            continue

        with codecs.open('data/dst/%s' % tsfn, 'r', 'utf8') as tsf:
            for line in tsf:
                tokens = tokenize(line)

                for token in tokens:
                    all_tokens.add(token)

                allf.write (u'<s> %s </s>\n' % ' '.join(tokens))

#
# wlist, dictionary
#

with codecs.open('%s/hal.vocab' % WORKDIR, 'w', 'utf8') as vocabf, \
     codecs.open('%s/hal.dic' % WORKDIR, 'w', 'utf8') as dicf:

    vocabf.write('</s>\n')
    vocabf.write('<s>\n')
    params = punkt_trainer.get_params()
    # print "Params: %s" % repr(params)

    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)

    with open(PUNKT_PICKLEFN, mode='wb') as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    print '%s written.' % PUNKT_PICKLEFN

else:
    print "Loading %s ..." % PUNKT_PICKLEFN

    with open(PUNKT_PICKLEFN, mode='rb') as f:
        tokenizer = pickle.load(f)

    print "Loading %s ... done." % PUNKT_PICKLEFN

with codecs.open(SENTENCEFN, 'w', 'utf8') as outf:

    print "applying punkt to parole..."
    parole_crawl(parole, apply_punkt)

    print "adding sentences from europarl..."
    with codecs.open(europarl, 'r', 'utf8') as inf:
        for line in inf:
            outf.write(u'%s\n' % ' '.join(tokenize(line)))

print '%s written.' % SENTENCEFN
print
def macro_expand(self, lang, nlps, preds):

    logging.debug ('macro_expand: nlps=%s, preds=%s' % (repr(nlps), repr(preds)))

    # handle implicit macros, e.g. "(hi|hello) computer"

    implicit_macros = {}

    nlps2 = []
    for nlp in nlps:

        nlp2 = ''
        i = 0
        while i < len(nlp):

            if nlp[i] == '(':

                j = nlp[i+1:].find(')')
                if j < 0:
                    raise Exception (') missing')
                j += i

                # extract macro

                macro_s = nlp[i+1:j+1]
                # print "macro_s: %s" % macro_s

                macro_name = '__INTERNAL_MACRO_%06d__' % len(implicit_macros)

                implicit_macros[macro_name] = []
                for s in macro_s.split('|'):
                    implicit_macros[macro_name].append({'w': s.strip()})

                nlp2 += '@' + macro_name + ':w '

                i = j+2
            else:
                nlp2 += nlp[i]
                i += 1

        nlps2.append(nlp2)
        # print "after implicit macro handling: %s" % nlp2
        # print "implicit macros: %s" % repr(implicit_macros)

    # extract all macros used

    macro_names = set()

    for nlp in nlps2:
        # print nlp
        for pos, char in enumerate(nlp):
            if char == '@':
                macro = re.match(r'@([A-Z0-9_]+):', nlp[pos:])
                # print "MACRO:", macro.group(1)
                macro_names.add(macro.group(1))

    # print "macro names used: %s" % macro_names

    # generate all macro-expansions

    macro_names = sorted(macro_names)
    todo = [ (0, {}) ]

    discourses = []

    while True:
        if len(todo) == 0:
            break

        idx, mappings = todo.pop(0)

        if idx < len(macro_names):
            macro_name = macro_names[idx]
            macro_dict = implicit_macros if macro_name in implicit_macros else self.named_macros
            for v in macro_dict[macro_name]:
                new_mappings = copy(mappings)
                new_mappings[macro_name] = v
                todo.append ( (idx+1, new_mappings) )
        else:
            # generate discourse for this set of mappings
            # print 'mappings:', repr(mappings)

            discourse = []

            for s, p in zip(nlps2, preds):
                # print s,p

                for k in mappings:
                    for v in mappings[k]:
                        s = s.replace('@'+k+':'+v, mappings[k][v])
                        p = p.replace('@'+k+':'+v, mappings[k][v])

                inp_raw = utils.compress_ws(s.lstrip().rstrip())
                p       = utils.compress_ws(p.lstrip().rstrip())

                # print s
                # print p

                # tokenize strings, wrap them into say() calls

                inp_tokenized = ' '.join(tokenize(inp_raw, lang))

                ps = p.split(';')

                np = ''
                for pr in ps:
                    if not pr.startswith('"'):
                        if len(np) > 0:
                            np += ';'
                        np += pr.strip()
                        continue

                    for word in tokenize(pr, lang):
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + word + '")'

                # preserve trailing sentence punctuation (char before closing quote)
                if len(p) > 2:
                    if p[len(p)-2] in ['.', '?', '!']:
                        if len(np) > 0:
                            np += ';'
                        np += 'say(' + lang + ', "' + p[len(p)-2] + '")'

                np += ';eou'

                discourse.append((inp_tokenized, np))

            logging.debug ('macro_expand: discourse : %s' % (repr(discourse)))
            discourses.append(discourse)

    return discourses
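# Hedged usage sketch: implicit alternatives in parentheses become internal
# macros, so one NLP string expands into one discourse per alternative (the
# `kb` name is an assumption):
#
#   discourses = kb.macro_expand('en', [u'(hi|hello) computer'], [u'"hi there!"'])
#   # -> two discourses, one for "hi computer" and one for "hello computer",
#   #    each with the response wrapped into say(en, ...) calls ending in eou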
def nlp_test(self, clause):

    args = clause.head.args

    lang = args[0].name

    # extract test rounds, look up matching discourses

    rounds        = []  # [ (in, out, actions), ...]
    round_num     = 0
    discourse_ids = set()

    for ivr in args[1:]:
        if ivr.name != 'ivr':
            raise PrologError('nlp_test: ivr predicate args expected.')

        test_in      = ''
        test_out     = ''
        test_actions = []

        for e in ivr.args:
            if e.name == 'in':
                test_in = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'out':
                test_out = ' '.join(tokenize(e.args[0].s, lang))
            elif e.name == 'action':
                test_actions.append(e.args)
            else:
                raise PrologError(
                    u'nlp_test: ivr predicate: unexpected arg: ' + unicode(e))

        rounds.append((test_in, test_out, test_actions))

        # look up matching discourse_ids:

        d_ids = set()
        for dr in self.session.query(model.DiscourseRound).filter(model.DiscourseRound.inp_tokenized==test_in) \
                              .filter(model.DiscourseRound.round_num==round_num).all():
            d_ids.add(dr.discourse_id)

        if round_num == 0:
            discourse_ids = d_ids
        else:
            discourse_ids = discourse_ids & d_ids

        # print 'discourse_ids:', repr(discourse_ids)

        round_num += 1

    if len(discourse_ids) == 0:
        raise PrologError('nlp_test: %s: no matching discourse found.' % clause.location)

    nlp_test_parser = PrologParser()

    # run the test(s): look up reaction to input in db, execute it, check result

    for did in discourse_ids:

        self.nlp_test_engine.reset_context()

        round_num = 0
        for dr in self.session.query(model.DiscourseRound).filter(model.DiscourseRound.discourse_id==did) \
                              .order_by(model.DiscourseRound.round_num):

            prolog_s = ','.join(dr.resp.split(';'))

            logging.info(
                "nlp_test: %s round=%3d, %s => %s" %
                (clause.location, round_num, dr.inp_tokenized, prolog_s))

            c = nlp_test_parser.parse_line_clause_body(prolog_s)
            # logging.debug( "Parse result: %s" % c)

            # logging.debug( "Searching for c: %s" % c )

            self.nlp_test_engine.reset_utterances()
            self.nlp_test_engine.reset_actions()
            solutions = self.nlp_test_engine.search(c)

            if len(solutions) == 0:
                raise PrologError('nlp_test: %s: no solution found.' % clause.location)

            # print "round %d utterances: %s" % (round_num, repr(nlp_test_engine.get_utterances()))

            # check actual utterances vs expected one

            test_in, test_out, test_actions = rounds[round_num]

            utterance_matched = False
            actual_out        = ''

            utts = self.nlp_test_engine.get_utterances()
            if len(utts) > 0:
                for utt in utts:
                    actual_out = ' '.join(tokenize(utt['utterance'], utt['lang']))
                    if actual_out == test_out:
                        utterance_matched = True
                        break
            else:
                # no utterance produced: that is a match iff none was expected
                utterance_matched = len(test_out) == 0

            if utterance_matched:
                if len(utts) > 0:
                    logging.info(
                        "nlp_test: %s round=%3d *** UTTERANCE MATCHED!" %
                        (clause.location, round_num))
            else:
                raise PrologError(
                    u'nlp_test: %s round=%3d actual utterance \'%s\' did not match expected utterance \'%s\'.'
                    % (clause.location, round_num, actual_out, test_out))

            # check actions: every expected action must appear among the
            # actions the engine recorded

            if len(test_actions) > 0:
                # print repr(test_actions)

                acts = self.nlp_test_engine.get_actions()

                actions_matched = True
                for action in test_actions:
                    matched = False
                    for act in acts:
                        # print "    check action match: %s vs %s" % (repr(action), repr(act))
                        if action == act:
                            matched = True
                            break
                    if not matched:
                        actions_matched = False
                        break

                if actions_matched:
                    logging.info(
                        "nlp_test: %s round=%3d *** ACTIONS MATCHED!" %
                        (clause.location, round_num))
                else:
                    raise PrologError(
                        u'nlp_test: %s round=%3d ACTIONS MISMATCH.' %
                        (clause.location, round_num))

            round_num += 1