Example #1
0
    def compile_module(self, module_name):
        """Re-compile all NLP artifacts for one module and store them in the DB.

        Clears previously stored training data, test cases and NER data for
        *module_name*, re-extracts them from the module's optional python
        hooks (``nlp_train`` / ``nlp_test``) and its AIP source files
        (``AIP_SOURCES``), then bulk-saves the results.

        :param module_name: key into ``self.modules`` identifying the module
        """
        m = self.modules[module_name]

        # clear module, delete old NLP training data

        self.db.clear_module(module_name, commit=True)
        self.session.query(model.TrainingData).filter(
            model.TrainingData.module == module_name).delete()
        self.session.query(model.TestCase).filter(
            model.TestCase.module == module_name).delete()
        self.session.query(model.NERData).filter(
            model.NERData.module == module_name).delete()

        # extract new training data for this module

        train_ds = []
        tests = []
        ner = {}

        if hasattr(m, 'nlp_train'):

            logging.info('module %s python training data extraction...' %
                         module_name)

            train_ds.extend(m.nlp_train(self))

        if hasattr(m, 'nlp_test'):

            logging.info('module %s python test case extraction...' %
                         module_name)

            tests.extend(m.nlp_test(self))

        if hasattr(m, 'AIP_SOURCES'):

            logging.info('module %s AIP training data extraction...' %
                         module_name)

            for inputfn in m.AIP_SOURCES:
                ds, ts, ne = self.aip_parser.compile_file(
                    'modules/%s/%s' % (module_name, inputfn), module_name)

                train_ds.extend(ds)
                tests.extend(ts)

                # merge NER entries: ner[lang][cls][entity] = label;
                # entries from later files override earlier ones on collision
                for lang in ne:
                    for cls in ne[lang]:
                        ner.setdefault(lang, {}).setdefault(cls, {}).update(
                            ne[lang][cls])

        logging.info(
            'module %s training data extraction done. %d training samples, %d tests'
            % (module_name, len(train_ds), len(tests)))

        # put training data into our DB, de-duplicated on (lang, inp, resp)

        td_set = set()
        td_list = []

        for utt_lang, contexts, i, resp, loc_fn, loc_line, loc_col, prio in train_ds:

            # model input is the context tokens followed by the utterance tokens
            inp = copy(contexts)
            inp.extend(i)

            inp_json = json.dumps(inp)
            resp_json = json.dumps(resp)

            utterance = u' '.join(i)

            # de-duplication key; contexts are covered via inp_json
            k = utt_lang + '#0#' + '#' + inp_json + '#' + resp_json
            if k not in td_set:
                td_set.add(k)
                td_list.append(
                    model.TrainingData(
                        lang=utt_lang,
                        module=module_name,
                        utterance=utterance,
                        inp=inp_json,
                        resp=resp_json,
                        prio=prio,
                        loc_fn=loc_fn,
                        loc_line=loc_line,
                        loc_col=loc_col,
                    ))

        logging.info(
            'module %s training data conversion done. %d unique training samples.'
            % (module_name, len(td_list)))

        start_time = time.time()
        logging.info(u'bulk saving to db...')
        self.session.bulk_save_objects(td_list)
        self.session.commit()
        logging.info(u'bulk saving to db... done. Took %fs.' %
                     (time.time() - start_time))

        # put test data into our DB

        td_list = []

        for name, lang, prep, rounds, loc_fn, loc_line, loc_col in tests:

            prep_json = prolog_to_json(prep)
            rounds_json = json.dumps(rounds)

            td_list.append(
                model.TestCase(lang=lang,
                               module=module_name,
                               name=name,
                               prep=prep_json,
                               rounds=rounds_json,
                               loc_fn=loc_fn,
                               loc_line=loc_line,
                               loc_col=loc_col))

        logging.info('module %s test data conversion done. %d tests.' %
                     (module_name, len(td_list)))

        start_time = time.time()
        logging.info(u'bulk saving to db...')
        self.session.bulk_save_objects(td_list)
        self.session.commit()
        logging.info(u'bulk saving to db... done. Took %fs.' %
                     (time.time() - start_time))

        # put NER data into our DB

        ner_list = []

        for lang in ner:
            for cls in ner[lang]:
                for entity in ner[lang][cls]:
                    ner_list.append(
                        model.NERData(lang=lang,
                                      module=module_name,
                                      cls=cls,
                                      entity=entity,
                                      label=ner[lang][cls][entity]))

        logging.info('module %s NER data conversion done. %d rows.' %
                     (module_name, len(ner_list)))

        start_time = time.time()
        logging.info(u'bulk saving to db...')
        self.session.bulk_save_objects(ner_list)
        self.session.commit()
        logging.info(u'bulk saving to db... done. Took %fs.' %
                     (time.time() - start_time))

        # final safety-net commit (all three phases committed above already)
        self.session.commit()
Example #2
0
    def ts(self, lang, test_name, rounds, prep=None):
        """Register an NLP test case in the database.

        Tokenizes every round's input/response, extracts and stores any
        attached round code, optionally captures the source of a *prep*
        function, and adds a ``model.TestCase`` row to the session.

        :param lang:      language code passed to ``tokenize``
        :param test_name: name under which the test case is stored
        :param rounds:    iterable of ``(inp, resp[, code[, arg]])`` tuples;
                          ``code`` is a callable whose source is stored via
                          ``self.store_code``, ``arg`` an optional argument
        :param prep:      optional preparation function whose source code is
                          stored alongside the test case
        """
        # caller's source location (file name, line number):

        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)

        self.src_location = (calframe[1][1], calframe[1][2])

        # normalize rounds by tokenizing inp/resp
        rs = []
        for r in rounds:

            md5s = None
            arg = None
            if len(r) > 2:
                code = r[2]
                if code:
                    code_src = inspect.getsource(code)
                    code_src = self._unindent(code_src)
                    code_ast = ast.parse(code_src)

                    # store the first function definition found in the source
                    for node in ast.walk(code_ast):
                        if isinstance(node, ast.FunctionDef):
                            code_fn = node.name
                            code_src = codegen.to_source(node)
                            md5s = self.store_code(code_src, code_fn)
                            break
                if len(r) > 3:
                    arg = r[3]

            rs.append((u' '.join(tokenize(r[0], lang=lang)),
                       u' '.join(tokenize(r[1], lang=lang)), md5s, arg))

        # extract prep code, if any

        prep_code = None
        prep_fn = None

        if prep:
            src_txt = inspect.getsource(prep)

            src_txt = self._unindent(src_txt)

            src_ast = ast.parse(src_txt)

            # BUGFIX: the original initialized the wrong variable
            # (`code_ast = None`), so a prep source without a FunctionDef
            # raised NameError on `prep_ast` / left `prep_code` unbound.
            prep_ast = None

            for node in ast.walk(src_ast):
                if isinstance(node, ast.FunctionDef):
                    prep_ast = node
                    prep_fn = node.name
                    break

            if prep_ast:
                prep_code = codegen.to_source(prep_ast)

        tc = model.TestCase(lang=lang,
                            module=self.data_module_name,
                            name=test_name,
                            prep_code=prep_code,
                            prep_fn=prep_fn,
                            rounds=json.dumps(rs),
                            loc_fn=self.src_location[0],
                            loc_line=self.src_location[1])
        self.session.add(tc)

        self.cnt_ts += 1