def _readMaps(cls, fsm_fn, encoding='utf-8'):
        """Load the input/output symbol maps that accompany *fsm_fn*.

        The maps are expected next to the FSM file, sharing its base name
        with ``.isym`` and ``.osym`` extensions.

        :return: tuple ``(input_map, output_map)`` of SymMap objects
        """
        base = os.path.splitext(fsm_fn)[0]
        input_map = SymMap.readFromFile(base + '.isym', encoding=encoding)
        output_map = SymMap.readFromFile(base + '.osym', encoding=encoding)
        return input_map, output_map
    def _readMaps(cls, fsm_fn, encoding='utf-8'):
        """Read the ``.isym``/``.osym`` symbol maps stored beside *fsm_fn*.

        :return: tuple ``(imap, omap)`` of SymMap objects
        """
        stem = os.path.splitext(fsm_fn)[0]
        return tuple(SymMap.readFromFile(stem + suffix, encoding=encoding)
                     for suffix in ('.isym', '.osym'))
    def main(self, concept_map, sym_map, examples, output, threshold):
        """Load the concept/symbol maps, remap the examples and write them sorted."""
        self.conceptMap = SymMap.readFromFile(concept_map, format=(int, unicode)).inverse
        self.symMap = SymMap.readFromFile(sym_map, format=(int, unicode)).inverse

        mapped = self.mapExamples(ADict.readFromFile(examples), threshold)

        def sort_key(item):
            # Order by concept, then by descending count, then by symbol.
            k, v = item
            return (k[0], -v, k[1])

        mapped.writeToFile(output, key=sort_key)
    def main(self, concept_map, sym_map, examples, output, threshold):
        """Read the maps, remap the example counts and write them out sorted."""
        fmt = (int, unicode)
        self.conceptMap = SymMap.readFromFile(concept_map, format=fmt).inverse
        self.symMap = SymMap.readFromFile(sym_map, format=fmt).inverse

        data = ADict.readFromFile(examples)
        data = self.mapExamples(data, threshold)

        def order(item):
            # Sort by concept, descending count, then symbol.
            key, count = item
            return (key[0], -count, key[1])

        data.writeToFile(output, key=order)
# Example #5 (vote count: 0) — scraped-example separator, not executable code
 def loadMaps(self, maps):
     """Read every map file name in *maps* from ``self.fsm_dir``.

     :return: list of SymMap objects, one per entry of *maps*
     """
     return [SymMap.readFromFile(pjoin(self.fsm_dir, name)) for name in maps]
    def main(
        self,
        model_dir,
        encoding=None,
        batch=False,
        omit_leaves=False,
        mlf=False,
        xml_dir=None,
        ref_mlf=None,
        skip_empty=False,
        input_chain=None,
        batch_size=100,
        no_underscores=True,
        force_pdt=False,
        pdt_dir=None,
    ):
        encoding = sys.stdout.encoding
        if encoding is None:
            if os.name == "nt":
                encoding = "cp1250"
            else:
                encoding = "iso-8859-2"

        datasets_fn = pjoin(model_dir, "datasets")
        datasets_fr = file(datasets_fn, "r")
        datasets = []
        isymMaps = []
        for i, line in enumerate(datasets_fr):
            line = line.strip()
            datasets.append(line)
            if line != "off":
                isymMaps.append(SymMap.readFromFile(pjoin(model_dir, "isym%d.map" % (i + 1,))))

        osymMap = SymMap.readFromFile(pjoin(model_dir, "osym.map"))

        if "signed" in datasets:
            da_type = "signed"
        else:
            da_type = "normalized"

        if not pdt_dir:
            pdt_dir = "/opt/PDT-2.0/tools/machine-annotation"

        if xml_dir:
            reader = input.MultiReader([xml_dir], input.DXMLReader)
            if force_pdt and "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        else:
            reader = input.StdInReader(encoding=encoding, type=da_type)
            if "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        if input_chain is not None:
            reader = input.InputChain(input_chain, reader)
        generator = input.InputGenerator(reader, datasets, datasets[0], noUnderscores=no_underscores)
        hypMLF = MLF()
        refMLF = MLF()
        if not batch:
            for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
                da_empty = not bool(da_semantics.strip())
                if da_empty and skip_empty:
                    continue

                refMLF[da_id] = da_semantics + "\n"
                dcd = self.parseLine(model_dir, [da_txts], isymMaps, osymMap, omitLeaves=omit_leaves)
                if dcd:
                    if len(dcd) == 1:
                        hypMLF[da_id] = dcd[0].encode(encoding) + "\n"
                    else:
                        hypMLF[da_id] = ";".join(dcd).encode(encoding) + "\n"
                else:
                    hypMLF[da_id] = line + "\n"
                if not mlf:
                    print hypMLF[da_id],
        else:
            all_processed = False
            inputs = generator.readInputs()
            while not all_processed:
                da_count = 0
                lines = []
                ids = []
                for da_fn, da_id, da_semantics, da_txts in inputs:
                    da_empty = not bool(da_semantics.strip())
                    if da_empty and skip_empty:
                        continue

                    refMLF[da_id] = da_semantics + "\n"
                    lines.append(da_txts)
                    ids.append(da_id)
                    da_count += 1
                    if da_count >= batch_size:
                        break
                else:
                    all_processed = True

                dcd = self.parseLine(model_dir, lines, isymMaps, osymMap, omitLeaves=omit_leaves)
                for da_id, ol in zip(ids, dcd):
                    hypMLF[da_id] = ol.encode(encoding) + "\n"
                    if not mlf:
                        print hypMLF[da_id],
        if mlf:
            s = "".join(hypMLF.toLines())
            print s

        if ref_mlf:
            refMLF.writeToFile(ref_mlf)
    def makeFMStxt(self, separ, dataLm):
        """Build the textual ``dacoder.fsm.txt`` transducer and compile it.

        For every dialogue-act name in *separ*, the per-act language model is
        converted to FSM arcs and stitched onto the shared initial state 0,
        entered via the `_operator_`/`_user_` input symbols while emitting
        the act's output symbol.  The merged input-symbol map and the output
        map are written alongside, and FSMCompile produces the binary FSM.

        :param separ: iterable of dialogue-act names
        :param dataLm: directory holding the per-act LM/FSM files
        """
        isym_map = None
        isym_fn = os.path.join(dataLm, 'dacoder.fsm.isym')
        fsm = os.path.join(dataLm, 'dacoder.fsm.txt')
        fsm_fw = file(fsm, 'w')
        # Offset added to per-act state numbers so acts don't collide.
        add = 1
        da_map = SymMap.readFromFile(os.path.join(dataLm, 'dialogue_act.fsm.isym'))
        for da in separ:
            fn = self.mapTXT(dataLm, da)
            fn_lm = self.mapLM(dataLm, da)
            fn_fsm = self.mapFSM(dataLm, da)

            # Output symbols for this act, one per speaker.
            da_num_op = da_map['operator_'+da]
            da_num_us = da_map['user_'+da]

            if isym_map is None:
                # First act only: create the shared input-symbol map and
                # extend it with speaker markers and speaker-prefixed acts.
                self.convertLMtoFSM(fn_lm, isym_fn)
                isym_map = SymMap.readFromFile(isym_fn)
                _empty_ = isym_map.add('_empty_')
                _operator_ = isym_map.add('_operator_')
                _user_ = isym_map.add('_user_')
                for i in separ:
                    isym_map.add('user_%s' % i)
                    isym_map.add('operator_%s' % i)
                isym_map.writeToFile(isym_fn)

            s0 = None
            states = set()
            for line in self.convertLMtoFSM(fn_lm):
                # GAWK hack: normalize decimal commas to dots.
                line = line.replace(',', '.')
                splitted = line.split()

                if s0 is None:
                    # First arc reveals the act's start state; link state 0
                    # to it for both speakers, emitting the act symbol.
                    s0 = int(splitted[0])+add
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _operator_, da_num_op, )
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _user_, da_num_us, )

                if len(splitted) in (1, 2):
                    # Final-state line: state number plus optional weight.
                    state_no = int(splitted[0])
                    if len(splitted) == 2:
                        weight = float(splitted[1])
                    else:
                        weight = 0.
                    # Close the act with an `_empty_` arc back to state 0.
                    print >> fsm_fw, '%d\t0\t%d\t0\t%e' % (state_no + add, _empty_, weight)
                    states.add(state_no)
                elif len(splitted) in (3, 4):
                    # Transition line: source, target, input symbol, weight.
                    state_no_1 = int(splitted[0])
                    state_no_2 = int(splitted[1])
                    isym = int(splitted[2])
                    if len(splitted) == 4:
                        weight = float(splitted[3])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t%d\t%d\t0\t%e' % (state_no_1+add, state_no_2+add, isym, weight)
                    states.add(state_no_1)
                    states.add(state_no_2)
                else:
                    raise ValueError("Unknown FSM line: %r" % line)
            # Move the offset past all state numbers used by this act.
            add += max(states)+1
        for i in separ:
            for j in ['user', 'operator']:
                # Self-loops on state 0 translating the speaker-prefixed
                # act input symbols into their output symbols.
                da = '%s_%s' % (j, i)
                isym = isym_map[da]
                osym = da_map[da]
                print >> fsm_fw, '0\t0\t%d\t%d\t0' % (isym, osym)
        # State 0 is also the final state.
        print >> fsm_fw, '0'
        fsm_fw.close()
        da_map.writeToFile(os.path.join(dataLm, 'dacoder.fsm.osym'))
        FSMCompile('-t', fsm, '-F', os.path.join(dataLm, 'dacoder.fsm'))
    def fsmconvert(self, pteMapFn=None):
        """Convert the trained GMTK model into FSM text files and compile them.

        Loads the concept map and the per-dataset symbol maps named in
        ``self.settings``, writes the ``datasets`` on/off list into FSM_DIR,
        reads the decoding master file into a GMTK workspace, and drives
        fsm.FSMGenerator to emit the repeater/padder/parser FSM files plus
        their symbol maps, finishing with :meth:`fsmcompile`.

        :param pteMapFn: optional parse-tree-extension symbol map file;
            when None, no PTE symbols are used.
        """
        sys.path.append('src')
        import fsm
        from svc.ui import gmtk

        # FSM-generation limits and pruning cutoffs from the settings.
        max_states = int(self.settings['FSM_STATES'])
        cutoff_sym = float(self.settings['FSM_CUTOFF_SYM'])
        cutoff_trans = float(self.settings['FSM_CUTOFF_TRANS'])

        self.setCommonParams()
        FSM_DIR = self.settings['FSM_DIR']
        mkdirp(FSM_DIR)

        conceptMapFn = self.settings['CONCEPT_MAP']
        self.logger.debug("Reading concept map: %s", conceptMapFn)
        conceptMap = SymMap.readFromFile(conceptMapFn,
                                         format=(int, unicode)).inverse
        # The internal sink concept must not appear in the generated FSM.
        del conceptMap['_SINK_']
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING DEPARTURE'.split())
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING'.split())

        # Write the dataset on/off list and read a map for each active one.
        dataset_fn = os.path.join(FSM_DIR, 'datasets')
        dataset_fw = file(dataset_fn, 'w')
        sMaps = []
        for ds in [1, 2, 3]:
            ds_value = self.settings['S%d_DATASET' % ds]
            if ds_value != 'off':
                mapFn = self.settings['S%d_MAP' % ds]
                self.logger.debug("Reading s%d map: %s", ds, mapFn)
                map = SymMap.readFromFile(mapFn, format=(int, unicode)).inverse
                #map = SymMap((k, v) for (k, v) in map.iteritems() if k in u'dobrý den kdy jede _empty_ _unseen_'.split())
                sMaps.append(map)
            else:
                self.logger.debug("Dataset s%d is turned off", ds)
                # Placeholder keeps sMaps aligned with dataset indices.
                sMaps.append(None)
            dataset_fw.write(ds_value + '\n')
        dataset_fw.close()

        if pteMapFn is not None:
            self.logger.debug("Reading pte map: %s", pteMapFn)
            pteMap = SymMap.readFromFile(pteMapFn, format=(unicode, int))
        else:
            pteMap = {}
        pteSymbols = pteMap.keys()

        # Load the decoding master file into a GMTK workspace.
        mstr = os.path.join(self.settings['MSTR_DCD_DIR'], 'in.mstr')
        cppOptions = self.settings['CPP_OPTIONS'].split()
        workspace = gmtk.Workspace(cppOptions=cppOptions, readDTS=False)
        self.logger.info('Reading master file: %s', mstr)
        workspace.readMasterFile(mstr)

        self.logger.info('Creating FSM from arcs')

        self.logger.info('Total number of concepts: %d', len(conceptMap))
        #self.logger.info('Total number of symbols: %d', len(s1Map))

        stateGenerator = fsm.FSMGenerator(workspace,
                                          conceptMap,
                                          sMaps,
                                          cutoff_sym,
                                          cutoff_trans,
                                          max_states,
                                          pteSymbols=pteSymbols,
                                          logger=self.logger)
        stateGenerator.writeFSMRepeater(
            os.path.join(FSM_DIR, 'hvsrepeater.txt'))
        stateGenerator.writeFSMPadder(os.path.join(FSM_DIR, 'hvspadder.txt'))
        stateGenerator.writeFSM(os.path.join(FSM_DIR, 'hvsparser_pad.txt'))

        # Persist the generated state/symbol maps next to the FSM files.
        stateGenerator.stateMap.writeToFile(os.path.join(FSM_DIR, 'state.map'))
        stateGenerator.osymMap.writeToFile(os.path.join(FSM_DIR, 'osym.map'))
        for i, map in enumerate(stateGenerator.isymMaps):
            map.writeToFile(os.path.join(FSM_DIR, 'isym%d.map' % (i + 1, )))
        stateGenerator.ipteMap.writeToFile(os.path.join(FSM_DIR, 'pte.map'))

        self.fsmcompile()
 def setupMaps(self, conceptMap, symMap):
     """Read the concept and symbol map files and store their inverses."""
     def load(fn):
         # Both maps share the same int->unicode file format.
         return SymMap.readFromFile(fn, format=(int, unicode)).inverse
     self.conceptMap = load(conceptMap)
     self.symMap = load(symMap)
    def main(self,
             model_dir,
             encoding=None,
             batch=False,
             omit_leaves=False,
             mlf=False,
             xml_dir=None,
             ref_mlf=None,
             skip_empty=False,
             input_chain=None,
             batch_size=100,
             no_underscores=True,
             force_pdt=False,
             pdt_dir=None):
        """Decode semantics for inputs using the FSM model in *model_dir*.

        Builds a reader chain (XML directory or stdin, optionally piped
        through PDT morphological annotation and *input_chain*), parses the
        inputs one-by-one or in batches of *batch_size*, prints hypotheses
        (or a full MLF when *mlf* is set) and optionally writes the
        reference semantics to *ref_mlf*.
        """
        # NOTE(review): the `encoding` argument is immediately overwritten
        # here, so callers cannot influence the output encoding — confirm
        # whether this is intended.
        encoding = sys.stdout.encoding
        if encoding is None:
            # No terminal encoding (e.g. redirected output): platform default.
            if os.name == 'nt':
                encoding = 'cp1250'
            else:
                encoding = 'iso-8859-2'

        # The `datasets` file lists one dataset name per line; "off" lines
        # have no corresponding input-symbol map.
        datasets_fn = pjoin(model_dir, 'datasets')
        datasets_fr = file(datasets_fn, 'r')
        datasets = []
        isymMaps = []
        for i, line in enumerate(datasets_fr):
            line = line.strip()
            datasets.append(line)
            if line != 'off':
                isymMaps.append(
                    SymMap.readFromFile(
                        pjoin(model_dir, 'isym%d.map' % (i + 1, ))))

        osymMap = SymMap.readFromFile(pjoin(model_dir, 'osym.map'))

        # Dialogue-act type is inferred from the dataset names.
        if 'signed' in datasets:
            da_type = 'signed'
        else:
            da_type = 'normalized'

        if not pdt_dir:
            pdt_dir = '/opt/PDT-2.0/tools/machine-annotation'

        if xml_dir:
            reader = input.MultiReader([xml_dir], input.DXMLReader)
            # NOTE(review): operator precedence makes this read as
            # (force_pdt and 'lemma' in datasets) or 'pos' in datasets, so
            # PDT is forced for 'pos' datasets even when force_pdt is False
            # — confirm whether that is intended.
            if force_pdt and 'lemma' in datasets or 'pos' in datasets:
                if os.name == 'nt':
                    raise RuntimeError(
                        "Datasets 'lemma' and 'pos' are unsupported on Windows"
                    )
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        else:
            reader = input.StdInReader(encoding=encoding, type=da_type)
            if 'lemma' in datasets or 'pos' in datasets:
                if os.name == 'nt':
                    raise RuntimeError(
                        "Datasets 'lemma' and 'pos' are unsupported on Windows"
                    )
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        if input_chain is not None:
            reader = input.InputChain(input_chain, reader)
        generator = input.InputGenerator(reader,
                                         datasets,
                                         datasets[0],
                                         noUnderscores=no_underscores)
        hypMLF = MLF()
        refMLF = MLF()
        if not batch:
            # On-line mode: decode each input as soon as it is read.
            for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
                da_empty = not bool(da_semantics.strip())
                if (da_empty and skip_empty):
                    continue

                refMLF[da_id] = da_semantics + '\n'
                dcd = self.parseLine(model_dir, [da_txts],
                                     isymMaps,
                                     osymMap,
                                     omitLeaves=omit_leaves)
                if dcd:
                    if len(dcd) == 1:
                        hypMLF[da_id] = dcd[0].encode(encoding) + '\n'
                    else:
                        hypMLF[da_id] = ';'.join(dcd).encode(encoding) + '\n'
                else:
                    # NOTE(review): `line` is the last dataset name left over
                    # from the loop above — this fallback looks unintended.
                    hypMLF[da_id] = line + '\n'
                if not mlf:
                    print hypMLF[da_id],
        else:
            # Batch mode: collect up to batch_size inputs per parse call.
            all_processed = False
            inputs = generator.readInputs()
            while not all_processed:
                da_count = 0
                lines = []
                ids = []
                for da_fn, da_id, da_semantics, da_txts in inputs:
                    da_empty = not bool(da_semantics.strip())
                    if (da_empty and skip_empty):
                        continue

                    refMLF[da_id] = da_semantics + '\n'
                    lines.append(da_txts)
                    ids.append(da_id)
                    da_count += 1
                    if da_count >= batch_size:
                        break
                else:
                    # for/else: generator exhausted without filling a batch.
                    all_processed = True

                dcd = self.parseLine(model_dir,
                                     lines,
                                     isymMaps,
                                     osymMap,
                                     omitLeaves=omit_leaves)
                for da_id, ol in zip(ids, dcd):
                    hypMLF[da_id] = ol.encode(encoding) + '\n'
                    if not mlf:
                        print hypMLF[da_id],
        if mlf:
            s = ''.join(hypMLF.toLines())
            print s

        if ref_mlf:
            refMLF.writeToFile(ref_mlf)
 def _readMaps(cls, fsm_fn, encoding='utf-8'):
     """Load the ``.isym`` input-symbol map stored beside *fsm_fn*."""
     isym_path = os.path.splitext(fsm_fn)[0] + '.isym'
     return SymMap.readFromFile(isym_path, encoding=encoding)
    def fsmconvert(self, pteMapFn=None):
        """Convert the trained GMTK model into FSM text form and compile it.

        Loads the concept map and per-dataset symbol maps named in
        ``self.settings``, writes the ``datasets`` on/off list into FSM_DIR,
        reads the decoding master file into a GMTK workspace, and uses
        fsm.FSMGenerator to emit the repeater/padder/parser FSM files and
        their symbol maps, finishing with :meth:`fsmcompile`.

        :param pteMapFn: optional parse-tree-extension symbol map file;
            when None, no PTE symbols are used.
        """
        sys.path.append('src')
        import fsm
        from svc.ui import gmtk

        # FSM-generation limits and pruning cutoffs from the settings.
        max_states = int(self.settings['FSM_STATES'])
        cutoff_sym = float(self.settings['FSM_CUTOFF_SYM'])
        cutoff_trans = float(self.settings['FSM_CUTOFF_TRANS'])

        self.setCommonParams()
        FSM_DIR = self.settings['FSM_DIR']
        mkdirp(FSM_DIR)

        conceptMapFn = self.settings['CONCEPT_MAP']
        self.logger.debug("Reading concept map: %s", conceptMapFn)
        conceptMap = SymMap.readFromFile(conceptMapFn, format=(int, unicode)).inverse
        # The internal sink concept must not appear in the generated FSM.
        del conceptMap['_SINK_']
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING DEPARTURE'.split())
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING'.split())

        # Write the dataset on/off list and read a map for each active one.
        dataset_fn = os.path.join(FSM_DIR, 'datasets')
        dataset_fw = file(dataset_fn, 'w')
        sMaps = []
        for ds in [1, 2, 3]:
            ds_value = self.settings['S%d_DATASET' % ds]
            if ds_value != 'off':
                mapFn = self.settings['S%d_MAP'% ds]
                self.logger.debug("Reading s%d map: %s", ds, mapFn)
                map = SymMap.readFromFile(mapFn, format=(int, unicode)).inverse
                #map = SymMap((k, v) for (k, v) in map.iteritems() if k in u'dobrý den kdy jede _empty_ _unseen_'.split())
                sMaps.append(map)
            else:
                self.logger.debug("Dataset s%d is turned off", ds)
                # Placeholder keeps sMaps aligned with dataset indices.
                sMaps.append(None)
            dataset_fw.write(ds_value + '\n')
        dataset_fw.close()

        if pteMapFn is not None:
            self.logger.debug("Reading pte map: %s", pteMapFn)
            pteMap = SymMap.readFromFile(pteMapFn, format=(unicode, int))
        else:
            pteMap = {}
        pteSymbols = pteMap.keys()

        # Load the decoding master file into a GMTK workspace.
        mstr = os.path.join(self.settings['MSTR_DCD_DIR'], 'in.mstr')
        cppOptions = self.settings['CPP_OPTIONS'].split()
        workspace = gmtk.Workspace(cppOptions=cppOptions, readDTS=False)
        self.logger.info('Reading master file: %s', mstr)
        workspace.readMasterFile(mstr)

        self.logger.info('Creating FSM from arcs')

        self.logger.info('Total number of concepts: %d', len(conceptMap))
        #self.logger.info('Total number of symbols: %d', len(s1Map))

        stateGenerator = fsm.FSMGenerator(workspace, conceptMap, sMaps,
                                    cutoff_sym, cutoff_trans, max_states,
                                    pteSymbols=pteSymbols,
                                    logger=self.logger)
        stateGenerator.writeFSMRepeater(os.path.join(FSM_DIR, 'hvsrepeater.txt'))
        stateGenerator.writeFSMPadder(os.path.join(FSM_DIR, 'hvspadder.txt'))
        stateGenerator.writeFSM(os.path.join(FSM_DIR, 'hvsparser_pad.txt'))

        # Persist the generated state/symbol maps next to the FSM files.
        stateGenerator.stateMap.writeToFile(os.path.join(FSM_DIR, 'state.map'))
        stateGenerator.osymMap.writeToFile(os.path.join(FSM_DIR, 'osym.map'))
        for i, map in enumerate(stateGenerator.isymMaps):
            map.writeToFile(os.path.join(FSM_DIR, 'isym%d.map' % (i+1, )))
        stateGenerator.ipteMap.writeToFile(os.path.join(FSM_DIR, 'pte.map'))

        self.fsmcompile()
 def setupMaps(self, conceptMap, symMap):
     """Load the inverse concept and symbol maps from the given files."""
     fmt = (int, unicode)  # both maps use the int->unicode file format
     self.conceptMap = SymMap.readFromFile(conceptMap, format=fmt).inverse
     self.symMap = SymMap.readFromFile(symMap, format=fmt).inverse
    def makeFMStxt(self, separ, dataLm):
        """Build the textual ``dacoder.fsm.txt`` transducer and compile it.

        For each dialogue-act name in *separ*, the per-act language model is
        converted to FSM arcs and attached to the shared initial state 0,
        entered via the `_operator_`/`_user_` input symbols while emitting
        the act's output symbol.  The merged input-symbol map and the output
        map are written alongside, and FSMCompile builds the binary FSM.

        :param separ: iterable of dialogue-act names
        :param dataLm: directory holding the per-act LM/FSM files
        """
        isym_map = None
        isym_fn = os.path.join(dataLm, 'dacoder.fsm.isym')
        fsm = os.path.join(dataLm, 'dacoder.fsm.txt')
        fsm_fw = file(fsm, 'w')
        # Offset added to per-act state numbers so acts don't collide.
        add = 1
        da_map = SymMap.readFromFile(
            os.path.join(dataLm, 'dialogue_act.fsm.isym'))
        for da in separ:
            fn = self.mapTXT(dataLm, da)
            fn_lm = self.mapLM(dataLm, da)
            fn_fsm = self.mapFSM(dataLm, da)

            # Output symbols for this act, one per speaker.
            da_num_op = da_map['operator_' + da]
            da_num_us = da_map['user_' + da]

            if isym_map is None:
                # First act only: create the shared input-symbol map and
                # extend it with speaker markers and speaker-prefixed acts.
                self.convertLMtoFSM(fn_lm, isym_fn)
                isym_map = SymMap.readFromFile(isym_fn)
                _empty_ = isym_map.add('_empty_')
                _operator_ = isym_map.add('_operator_')
                _user_ = isym_map.add('_user_')
                for i in separ:
                    isym_map.add('user_%s' % i)
                    isym_map.add('operator_%s' % i)
                isym_map.writeToFile(isym_fn)

            s0 = None
            states = set()
            for line in self.convertLMtoFSM(fn_lm):
                # GAWK hack: normalize decimal commas to dots.
                line = line.replace(',', '.')
                splitted = line.split()

                if s0 is None:
                    # First arc reveals the act's start state; link state 0
                    # to it for both speakers, emitting the act symbol.
                    s0 = int(splitted[0]) + add
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (
                        s0,
                        _operator_,
                        da_num_op,
                    )
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (
                        s0,
                        _user_,
                        da_num_us,
                    )

                if len(splitted) in (1, 2):
                    # Final-state line: state number plus optional weight.
                    state_no = int(splitted[0])
                    if len(splitted) == 2:
                        weight = float(splitted[1])
                    else:
                        weight = 0.
                    # Close the act with an `_empty_` arc back to state 0.
                    print >> fsm_fw, '%d\t0\t%d\t0\t%e' % (state_no + add,
                                                           _empty_, weight)
                    states.add(state_no)
                elif len(splitted) in (3, 4):
                    # Transition line: source, target, input symbol, weight.
                    state_no_1 = int(splitted[0])
                    state_no_2 = int(splitted[1])
                    isym = int(splitted[2])
                    if len(splitted) == 4:
                        weight = float(splitted[3])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t%d\t%d\t0\t%e' % (
                        state_no_1 + add, state_no_2 + add, isym, weight)
                    states.add(state_no_1)
                    states.add(state_no_2)
                else:
                    raise ValueError("Unknown FSM line: %r" % line)
            # Move the offset past all state numbers used by this act.
            add += max(states) + 1
        for i in separ:
            for j in ['user', 'operator']:
                # Self-loops on state 0 translating the speaker-prefixed
                # act input symbols into their output symbols.
                da = '%s_%s' % (j, i)
                isym = isym_map[da]
                osym = da_map[da]
                print >> fsm_fw, '0\t0\t%d\t%d\t0' % (isym, osym)
        # State 0 is also the final state.
        print >> fsm_fw, '0'
        fsm_fw.close()
        da_map.writeToFile(os.path.join(dataLm, 'dacoder.fsm.osym'))
        FSMCompile('-t', fsm, '-F', os.path.join(dataLm, 'dacoder.fsm'))
 def _readMaps(cls, fsm_fn, encoding='utf-8'):
     """Return the input-symbol map (``.isym``) associated with *fsm_fn*."""
     base, _ext = os.path.splitext(fsm_fn)
     return SymMap.readFromFile(base + '.isym', encoding=encoding)