def compile_module(self, module_name):

    m = self.modules[module_name]

    # clear module, delete old NLP training data

    self.db.clear_module(module_name, commit=True)

    self.session.query(model.TrainingData).filter(model.TrainingData.module == module_name).delete()
    self.session.query(model.TestCase).filter(model.TestCase.module == module_name).delete()
    self.session.query(model.NERData).filter(model.NERData.module == module_name).delete()

    # extract new training data for this module

    train_ds = []
    tests    = []
    ner      = {}

    if hasattr(m, 'nlp_train'):
        logging.info('module %s python training data extraction...' % module_name)
        nlp_train = getattr(m, 'nlp_train')
        train_ds.extend(nlp_train(self))

    if hasattr(m, 'nlp_test'):
        logging.info('module %s python test case extraction...' % module_name)
        nlp_test = getattr(m, 'nlp_test')
        nlp_tests = nlp_test(self)
        tests.extend(nlp_tests)

    if hasattr(m, 'AIP_SOURCES'):
        logging.info('module %s AIP training data extraction...' % module_name)
        for inputfn in m.AIP_SOURCES:
            ds, ts, ne = self.aip_parser.compile_file('modules/%s/%s' % (module_name, inputfn),
                                                      module_name)
            train_ds.extend(ds)
            tests.extend(ts)
            # merge NER entries: ner[lang][cls][entity] = label
            for lang in ne:
                if lang not in ner:
                    ner[lang] = {}
                for cls in ne[lang]:
                    if cls not in ner[lang]:
                        ner[lang][cls] = {}
                    for entity in ne[lang][cls]:
                        ner[lang][cls][entity] = ne[lang][cls][entity]

    logging.info('module %s training data extraction done. %d training samples, %d tests'
                 % (module_name, len(train_ds), len(tests)))

    # put training data into our DB, deduplicated on (lang, inp, resp)

    td_set  = set()
    td_list = []
    for utt_lang, contexts, i, resp, loc_fn, loc_line, loc_col, prio in train_ds:

        inp = copy(contexts)
        inp.extend(i)

        inp_json  = json.dumps(inp)
        resp_json = json.dumps(resp)

        utterance = u' '.join(i)

        k = utt_lang + '#0#' + '#' + inp_json + '#' + resp_json
        if k not in td_set:
            td_set.add(k)
            td_list.append(model.TrainingData(lang      = utt_lang,
                                              module    = module_name,
                                              utterance = utterance,
                                              inp       = inp_json,
                                              resp      = resp_json,
                                              prio      = prio,
                                              loc_fn    = loc_fn,
                                              loc_line  = loc_line,
                                              loc_col   = loc_col))

    logging.info('module %s training data conversion done. %d unique training samples.'
                 % (module_name, len(td_list)))

    start_time = time.time()
    logging.info(u'bulk saving to db...')
    self.session.bulk_save_objects(td_list)
    self.session.commit()
    logging.info(u'bulk saving to db... done. Took %fs.' % (time.time() - start_time))

    # put test data into our DB

    td_list = []
    for name, lang, prep, rounds, loc_fn, loc_line, loc_col in tests:
        prep_json   = prolog_to_json(prep)
        rounds_json = json.dumps(rounds)
        td_list.append(model.TestCase(lang     = lang,
                                      module   = module_name,
                                      name     = name,
                                      prep     = prep_json,
                                      rounds   = rounds_json,
                                      loc_fn   = loc_fn,
                                      loc_line = loc_line,
                                      loc_col  = loc_col))

    logging.info('module %s test data conversion done. %d tests.' % (module_name, len(td_list)))

    start_time = time.time()
    logging.info(u'bulk saving to db...')
    self.session.bulk_save_objects(td_list)
    self.session.commit()
    logging.info(u'bulk saving to db... done. Took %fs.' % (time.time() - start_time))

    # put NER data into our DB

    ner_list = []
    for lang in ner:
        for cls in ner[lang]:
            for entity in ner[lang][cls]:
                ner_list.append(model.NERData(lang   = lang,
                                              module = module_name,
                                              cls    = cls,
                                              entity = entity,
                                              label  = ner[lang][cls][entity]))

    logging.info('module %s NER data conversion done. %d rows.' % (module_name, len(ner_list)))

    start_time = time.time()
    logging.info(u'bulk saving to db...')
    self.session.bulk_save_objects(ner_list)
    self.session.commit()
    logging.info(u'bulk saving to db... done. Took %fs.' % (time.time() - start_time))
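# For reference, a minimal sketch of the module-side interface that
# compile_module() probes for above. The module layout, file name and sample
# data are hypothetical; only the three optional attributes and the tuple
# shapes are taken from the extraction code:
#
#     # modules/smalltalk/__init__.py  (hypothetical module)
#
#     AIP_SOURCES = ['smalltalk.aip']   # fed to aip_parser.compile_file()
#
#     def nlp_train(k):                 # k is the engine (called as nlp_train(self))
#         # one 8-tuple per sample:
#         # (lang, contexts, inp, resp, loc_fn, loc_line, loc_col, prio)
#         return [('en', [], [u'hello', u'computer'], [u'Hello!'], __file__, 1, 0, 0)]
#
#     def nlp_test(k):
#         # 7-tuples: (name, lang, prep, rounds, loc_fn, loc_line, loc_col)
#         return []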
def ts(self, lang, test_name, rounds, prep=None):

    # caller's source location:

    curframe = inspect.currentframe()
    calframe = inspect.getouterframes(curframe, 2)

    self.src_location = (calframe[1][1], calframe[1][2])

    # normalize rounds by tokenizing inp/resp

    rs = []
    for r in rounds:
        md5s = None
        arg  = None
        if len(r) > 2:
            code = r[2]
            if code:
                # store the round's verifier function source under its md5 hash
                code_src = inspect.getsource(code)
                code_src = self._unindent(code_src)
                code_ast = ast.parse(code_src)
                for node in ast.walk(code_ast):
                    if isinstance(node, ast.FunctionDef):
                        code_fn  = node.name
                        code_src = codegen.to_source(node)
                        md5s     = self.store_code(code_src, code_fn)
                        break
            if len(r) > 3:
                arg = r[3]
        rs.append((u' '.join(tokenize(r[0], lang=lang)),
                   u' '.join(tokenize(r[1], lang=lang)),
                   md5s,
                   arg))

    # extract prep code, if any

    prep_code = None
    prep_fn   = None
    if prep:
        src_txt = inspect.getsource(prep)
        src_txt = self._unindent(src_txt)
        src_ast = ast.parse(src_txt)

        prep_ast = None  # initialize prep_ast (not code_ast), so the check below cannot hit an unbound name
        for node in ast.walk(src_ast):
            if isinstance(node, ast.FunctionDef):
                prep_ast = node
                prep_fn  = node.name
                break

        if prep_ast:
            prep_code = codegen.to_source(prep_ast)

    tc = model.TestCase(lang      = lang,
                        module    = self.data_module_name,
                        name      = test_name,
                        prep_code = prep_code,
                        prep_fn   = prep_fn,
                        rounds    = json.dumps(rs),
                        loc_fn    = self.src_location[0],
                        loc_line  = self.src_location[1])
    self.session.add(tc)
    self.cnt_ts += 1
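# A sketch of how ts() might be invoked from a module's test code; the
# greeting content and function names are made up, but the round shape
# (inp, resp[, code[, arg]]) and the prep callback follow the tuple
# handling above:
#
#     def nlp_test(k):
#         def prep(k):
#             pass                      # source is extracted via inspect/ast
#
#         def answer_hello(k):
#             pass                      # stored under an md5 key via store_code()
#
#         # shorter round tuples leave md5s/arg as None
#         k.ts('en', 'greeting_test',
#              [(u'Hello computer!', u'Hello!', answer_hello, 42),
#               (u'How are you?',    u'Fine, thanks.')],
#              prep=prep)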