def _readMaps(cls, fsm_fn, encoding='utf-8'):
        """Read the input/output symbol maps stored beside *fsm_fn*.

        The maps are expected at <base>.isym and <base>.osym, where <base>
        is *fsm_fn* without its extension.  Returns (imap, omap).
        """
        base = os.path.splitext(fsm_fn)[0]
        input_map = SymMap.readFromFile(base + '.isym', encoding=encoding)
        output_map = SymMap.readFromFile(base + '.osym', encoding=encoding)
        return input_map, output_map
    def _readMaps(cls, fsm_fn, encoding='utf-8'):
        """Read the input/output symbol maps stored beside *fsm_fn*.

        The maps live next to the FSM file as <base>.isym and <base>.osym.
        Returns (imap, omap).
        """
        imap_fn = os.path.splitext(fsm_fn)[0]+'.isym'
        imap = SymMap.readFromFile(imap_fn, encoding=encoding)

        omap_fn = os.path.splitext(fsm_fn)[0]+'.osym'
        omap = SymMap.readFromFile(omap_fn, encoding=encoding)
        return imap, omap
    def genPadder(self):
        """Generate FSM arcs (src, dst, isym, osym) for a padding transducer.

        Symbol ids from ``self.pteMap`` and every map in ``self.symbolMaps``
        are shifted into one non-overlapping id space via the running
        ``count`` offset.  ``_pop_`` ends up one past the largest shifted id,
        is emitted on the initial arc, and is finally stored on
        ``self._pop_``.
        """
        symbols = []
        maps2 = []
        count = 1   # next free shifted id (0 appears to be reserved for epsilon elsewhere in this module)
        _pop_ = 0   # running maximum over all shifted symbol ids

        if self.pteMap:
            pte_symbols = sorted(self.pteMap.values())
            pte_map = SymMap()    # old id -> shifted id
            pte_map2 = SymMap()   # symbol key -> shifted id
            for key, value in sorted(self.pteMap.items()):
                pte_map[value] = value+count
                pte_map2[key] = value+count
                _pop_ = max(_pop_, value+count)
            count += len(pte_map)
        else:
            pte_symbols = []
            pte_map = {}
            pte_map2 = {}

        # Shift every dataset symbol map into the shared id space.
        for map in self.symbolMaps:
            if map is None:
                map = {}
            symbols.append(sorted(map.values()))
            new_map = SymMap()
            new_map2 = SymMap()
            for key, value in sorted(map.items()):
                new_map[value] = value+count
                new_map2[key] = value+count
                _pop_ = max(_pop_, value+count)
            count += len(new_map)
            maps2.append(new_map2)

        _pop_ += 1   # one past the highest used id: the pop marker symbol

        n_sets = sum(1 for i in maps2 if len(i)!=0)   # number of non-empty datasets
        p_sets = 0

        end_state = 0
        state = 1
        # Initial arc: consume epsilon (0), emit the pop marker.
        yield end_state, state, 0, _pop_

        # Self-loops on state 1 for every pass-through-empty symbol.
        for key, value in sorted(pte_map.items()):
            yield state, state, value, value

        # One state per non-empty dataset map; the last one transitions back
        # into the end state.
        for map in maps2:
            if len(map) == 0:
                continue
            p_sets += 1
            if p_sets == n_sets:
                new_state = end_state
            else:
                new_state = state + 1
            for key, value in sorted(map.items()):
                yield state, new_state, value, value
            state += 1

        self._pop_ = _pop_
    def main(self, concept_map, sym_map, examples, output, threshold):
        """Load the maps, map the examples and write them out sorted.

        Entries are ordered by the first key component, then by descending
        value, then by the second key component.
        """
        fmt = (int, unicode)
        self.conceptMap = SymMap.readFromFile(concept_map, format=fmt).inverse
        self.symMap = SymMap.readFromFile(sym_map, format=fmt).inverse

        mapped = self.mapExamples(ADict.readFromFile(examples), threshold)

        def sort_key(item):
            k, v = item
            return (k[0], -v, k[1])

        mapped.writeToFile(output, key=sort_key)
    def main(self, concept_map, sym_map, examples, output, threshold):
        """Load the maps, map the examples and write them out sorted.

        Entries are ordered by the first key component, then by descending
        value, then by the second key component.
        """
        self.conceptMap = SymMap.readFromFile(concept_map,
                                              format=(int, unicode)).inverse
        self.symMap = SymMap.readFromFile(sym_map,
                                          format=(int, unicode)).inverse

        examples = ADict.readFromFile(examples)
        examples = self.mapExamples(examples, threshold)

        # Python 2 tuple-unpacking lambda; k is presumably a (concept,
        # symbol) pair and v its count -- TODO confirm against mapExamples.
        key = lambda (k, v): (k[0], -v, k[1])
        examples.writeToFile(output, key=key)
    def genRepeater(self):
        """Generate FSM arcs (src, dst, isym, osym) for a repeater transducer.

        Symbol ids from ``self.pteMap`` and the dataset symbol maps are first
        shifted into one non-overlapping id space via the running ``count``
        offset, exactly as in :meth:`genPadder` but without tracking a pop
        marker.
        """
        symbols = []
        maps2 = []
        count = 1   # next free shifted id; 0 is used as epsilon below

        if self.pteMap:
            pte_symbols = sorted(self.pteMap.values())
            pte_map = SymMap()    # old id -> shifted id
            pte_map2 = SymMap()   # symbol key -> shifted id
            for key, value in sorted(self.pteMap.items()):
                pte_map[value] = value+count
                pte_map2[key] = value+count
            count += len(pte_map)
        else:
            pte_symbols = []
            pte_map = {}
            pte_map2 = {}

        # Shift every dataset symbol map into the shared id space.
        for map in self.symbolMaps:
            if map is None:
                map = {}
            symbols.append(sorted(map.values()))
            new_map = SymMap()
            new_map2 = SymMap()
            for key, value in sorted(map.items()):
                new_map[value] = value+count
                new_map2[key] = value+count
            count += len(new_map)
            maps2.append(new_map2)

        end_state = state = 0
        state_map = SymMap()
        if pte_map2:
            # One looping state per pass-through-empty symbol: enter on the
            # symbol, loop on it and on every dataset symbol, leave on
            # epsilon back to the end state.
            for value in pte_map2.values():
                state += 1
                yield end_state, state, value, value
                yield state, state, 0, value
                for map in maps2:
                    for sym in map.values():
                        yield state, state, sym, sym
                yield state, end_state, 0, 0
        else:
            # No PTE symbols: every dataset symbol self-loops on state 0.
            for map in maps2:
                for sym in map.values():
                    yield state, state, sym, sym
# ---- Example #7 ----
    def genPadder(self):
        """Yield (src, dst, isym, osym) arcs of the padding transducer.

        All symbol ids (pass-through-empty first, then each dataset map) are
        moved into a single disjoint id space; the pop marker symbol is one
        past the largest shifted id and is also saved on ``self._pop_``.
        """
        value_lists = []
        shifted_by_key = []
        offset = 1    # next free shifted id
        pop_sym = 0   # running maximum of shifted ids

        if self.pteMap:
            pte_values = sorted(self.pteMap.values())
            shifted_pte = SymMap()         # old id -> shifted id
            shifted_pte_by_key = SymMap()  # symbol key -> shifted id
            for sym_key, sym_val in sorted(self.pteMap.items()):
                shifted = sym_val + offset
                shifted_pte[sym_val] = shifted
                shifted_pte_by_key[sym_key] = shifted
                if shifted > pop_sym:
                    pop_sym = shifted
            offset += len(shifted_pte)
        else:
            pte_values = []
            shifted_pte = {}
            shifted_pte_by_key = {}

        for sym_map in self.symbolMaps:
            current = sym_map if sym_map is not None else {}
            value_lists.append(sorted(current.values()))
            shifted_vals = SymMap()
            shifted_keys = SymMap()
            for sym_key, sym_val in sorted(current.items()):
                shifted = sym_val + offset
                shifted_vals[sym_val] = shifted
                shifted_keys[sym_key] = shifted
                if shifted > pop_sym:
                    pop_sym = shifted
            offset += len(shifted_vals)
            shifted_by_key.append(shifted_keys)

        # The pop marker takes the first id past every shifted symbol.
        pop_sym += 1

        remaining = sum(1 for m in shifted_by_key if m)

        end_state = 0
        state = 1
        # Initial arc: consume epsilon, emit the pop marker.
        yield end_state, state, 0, pop_sym

        # Self-loops for every pass-through-empty symbol.
        for _, shifted in sorted(shifted_pte.items()):
            yield state, state, shifted, shifted

        # One state per non-empty dataset; the last transitions back into
        # the end state.
        for sym_map in shifted_by_key:
            if not sym_map:
                continue
            remaining -= 1
            target = end_state if remaining == 0 else state + 1
            for _, shifted in sorted(sym_map.items()):
                yield state, target, shifted, shifted
            state += 1

        self._pop_ = pop_sym
# ---- Example #8 ----
    def genRepeater(self):
        """Yield (src, dst, isym, osym) arcs of the repeater transducer.

        Symbol ids are first shifted into one disjoint id space (pass-
        through-empty symbols, then each dataset map).
        """
        value_lists = []
        shifted_by_key = []
        offset = 1   # next free shifted id

        if self.pteMap:
            pte_values = sorted(self.pteMap.values())
            shifted_pte = SymMap()         # old id -> shifted id
            shifted_pte_by_key = SymMap()  # symbol key -> shifted id
            for sym_key, sym_val in sorted(self.pteMap.items()):
                shifted_pte[sym_val] = sym_val + offset
                shifted_pte_by_key[sym_key] = sym_val + offset
            offset += len(shifted_pte)
        else:
            pte_values = []
            shifted_pte = {}
            shifted_pte_by_key = {}

        for sym_map in self.symbolMaps:
            current = sym_map if sym_map is not None else {}
            value_lists.append(sorted(current.values()))
            shifted_vals = SymMap()
            shifted_keys = SymMap()
            for sym_key, sym_val in sorted(current.items()):
                shifted_vals[sym_val] = sym_val + offset
                shifted_keys[sym_key] = sym_val + offset
            offset += len(shifted_vals)
            shifted_by_key.append(shifted_keys)

        end_state = state = 0
        state_map = SymMap()
        if shifted_pte_by_key:
            # One looping state per pass-through-empty symbol: enter on it,
            # loop on it and every dataset symbol, leave on epsilon.
            for pte_sym in shifted_pte_by_key.values():
                state += 1
                yield end_state, state, pte_sym, pte_sym
                yield state, state, 0, pte_sym
                for sym_map in shifted_by_key:
                    for sym in sym_map.values():
                        yield state, state, sym, sym
                yield state, end_state, 0, 0
        else:
            # No PTE symbols: every dataset symbol self-loops on state 0.
            for sym_map in shifted_by_key:
                for sym in sym_map.values():
                    yield state, state, sym, sym
 def _emptyMaps(cls):
     """Create fresh input/output symbol maps holding only epsilon -> 0."""
     input_map, output_map = SymMap(), SymMap()
     input_map[cls.eps] = 0
     output_map[cls.eps] = 0
     return input_map, output_map
    def main(
        self,
        model_dir,
        encoding=None,
        batch=False,
        omit_leaves=False,
        mlf=False,
        xml_dir=None,
        ref_mlf=None,
        skip_empty=False,
        input_chain=None,
        batch_size=100,
        no_underscores=True,
        force_pdt=False,
        pdt_dir=None,
    ):
        """Decode semantics for input utterances using the model in *model_dir*.

        Reads the ``datasets`` file and the per-dataset input symbol maps
        from *model_dir*, builds an input reader (an XML directory or stdin,
        optionally wrapped in a PDT preprocessing step and an input chain),
        parses either one utterance at a time or in batches of *batch_size*,
        prints hypotheses (or a complete MLF when *mlf* is set), and writes
        reference semantics to *ref_mlf* when given.
        """
        # The encoding parameter is ignored; stdout's encoding wins, with a
        # platform-dependent fallback.
        encoding = sys.stdout.encoding
        if encoding is None:
            if os.name == "nt":
                encoding = "cp1250"
            else:
                encoding = "iso-8859-2"

        # One line per dataset; datasets marked "off" get no symbol map.
        datasets_fn = pjoin(model_dir, "datasets")
        datasets_fr = file(datasets_fn, "r")
        datasets = []
        isymMaps = []
        for i, line in enumerate(datasets_fr):
            line = line.strip()
            datasets.append(line)
            if line != "off":
                isymMaps.append(SymMap.readFromFile(pjoin(model_dir, "isym%d.map" % (i + 1,))))

        osymMap = SymMap.readFromFile(pjoin(model_dir, "osym.map"))

        if "signed" in datasets:
            da_type = "signed"
        else:
            da_type = "normalized"

        if not pdt_dir:
            pdt_dir = "/opt/PDT-2.0/tools/machine-annotation"

        if xml_dir:
            reader = input.MultiReader([xml_dir], input.DXMLReader)
            # NOTE(review): this parses as `(force_pdt and "lemma" in
            # datasets) or ("pos" in datasets)` -- confirm "pos" alone is
            # meant to trigger PDT even without force_pdt.
            if force_pdt and "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        else:
            reader = input.StdInReader(encoding=encoding, type=da_type)
            if "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        if input_chain is not None:
            reader = input.InputChain(input_chain, reader)
        generator = input.InputGenerator(reader, datasets, datasets[0], noUnderscores=no_underscores)
        hypMLF = MLF()
        refMLF = MLF()
        if not batch:
            # On-line mode: parse one utterance per call.
            for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
                da_empty = not bool(da_semantics.strip())
                if da_empty and skip_empty:
                    continue

                refMLF[da_id] = da_semantics + "\n"
                dcd = self.parseLine(model_dir, [da_txts], isymMaps, osymMap, omitLeaves=omit_leaves)
                if dcd:
                    if len(dcd) == 1:
                        hypMLF[da_id] = dcd[0].encode(encoding) + "\n"
                    else:
                        hypMLF[da_id] = ";".join(dcd).encode(encoding) + "\n"
                else:
                    # NOTE(review): `line` still holds the last dataset name
                    # from the loop above -- verify this fallback is intended.
                    hypMLF[da_id] = line + "\n"
                if not mlf:
                    print hypMLF[da_id],
        else:
            # Batch mode: collect up to batch_size utterances per parse call.
            all_processed = False
            inputs = generator.readInputs()
            while not all_processed:
                da_count = 0
                lines = []
                ids = []
                for da_fn, da_id, da_semantics, da_txts in inputs:
                    da_empty = not bool(da_semantics.strip())
                    if da_empty and skip_empty:
                        continue

                    refMLF[da_id] = da_semantics + "\n"
                    lines.append(da_txts)
                    ids.append(da_id)
                    da_count += 1
                    if da_count >= batch_size:
                        break
                else:
                    # Generator exhausted without filling the batch.
                    all_processed = True

                dcd = self.parseLine(model_dir, lines, isymMaps, osymMap, omitLeaves=omit_leaves)
                for da_id, ol in zip(ids, dcd):
                    hypMLF[da_id] = ol.encode(encoding) + "\n"
                    if not mlf:
                        print hypMLF[da_id],
        if mlf:
            s = "".join(hypMLF.toLines())
            print s

        if ref_mlf:
            refMLF.writeToFile(ref_mlf)
    def fsmconvert(self, pteMapFn=None):
        """Convert the trained GMTK model into FSM text files under FSM_DIR.

        Loads the concept map and per-dataset symbol maps according to
        ``self.settings``, reads the GMTK master file, runs
        ``fsm.FSMGenerator`` to write the repeater/padder/parser FSMs and
        their symbol-map files, and finally invokes ``self.fsmcompile()``.

        pteMapFn -- optional file with pass-through-empty symbols.
        """
        sys.path.append('src')
        import fsm
        from svc.ui import gmtk

        max_states = int(self.settings['FSM_STATES'])
        cutoff_sym = float(self.settings['FSM_CUTOFF_SYM'])
        cutoff_trans = float(self.settings['FSM_CUTOFF_TRANS'])

        self.setCommonParams()
        FSM_DIR = self.settings['FSM_DIR']
        mkdirp(FSM_DIR)

        conceptMapFn = self.settings['CONCEPT_MAP']
        self.logger.debug("Reading concept map: %s", conceptMapFn)
        conceptMap = SymMap.readFromFile(conceptMapFn,
                                         format=(int, unicode)).inverse
        # Drop the sink concept -- presumably it must not appear in the
        # generated FSM; confirm against FSMGenerator.
        del conceptMap['_SINK_']
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING DEPARTURE'.split())
        #conceptMap = SymMap((k, v) for (k, v) in conceptMap.iteritems() if k in '_EMPTY_ GREETING'.split())

        # Record which datasets are active and load a symbol map for each;
        # disabled datasets contribute None.
        dataset_fn = os.path.join(FSM_DIR, 'datasets')
        dataset_fw = file(dataset_fn, 'w')
        sMaps = []
        for ds in [1, 2, 3]:
            ds_value = self.settings['S%d_DATASET' % ds]
            if ds_value != 'off':
                mapFn = self.settings['S%d_MAP' % ds]
                self.logger.debug("Reading s%d map: %s", ds, mapFn)
                map = SymMap.readFromFile(mapFn, format=(int, unicode)).inverse
                #map = SymMap((k, v) for (k, v) in map.iteritems() if k in u'dobrý den kdy jede _empty_ _unseen_'.split())
                sMaps.append(map)
            else:
                self.logger.debug("Dataset s%d is turned off", ds)
                sMaps.append(None)
            dataset_fw.write(ds_value + '\n')
        dataset_fw.close()

        if pteMapFn is not None:
            self.logger.debug("Reading pte map: %s", pteMapFn)
            pteMap = SymMap.readFromFile(pteMapFn, format=(unicode, int))
        else:
            pteMap = {}
        pteSymbols = pteMap.keys()

        mstr = os.path.join(self.settings['MSTR_DCD_DIR'], 'in.mstr')
        cppOptions = self.settings['CPP_OPTIONS'].split()
        workspace = gmtk.Workspace(cppOptions=cppOptions, readDTS=False)
        self.logger.info('Reading master file: %s', mstr)
        workspace.readMasterFile(mstr)

        self.logger.info('Creating FSM from arcs')

        self.logger.info('Total number of concepts: %d', len(conceptMap))
        #self.logger.info('Total number of symbols: %d', len(s1Map))

        stateGenerator = fsm.FSMGenerator(workspace,
                                          conceptMap,
                                          sMaps,
                                          cutoff_sym,
                                          cutoff_trans,
                                          max_states,
                                          pteSymbols=pteSymbols,
                                          logger=self.logger)
        stateGenerator.writeFSMRepeater(
            os.path.join(FSM_DIR, 'hvsrepeater.txt'))
        stateGenerator.writeFSMPadder(os.path.join(FSM_DIR, 'hvspadder.txt'))
        stateGenerator.writeFSM(os.path.join(FSM_DIR, 'hvsparser_pad.txt'))

        # Persist every symbol map next to the generated FSM files.
        stateGenerator.stateMap.writeToFile(os.path.join(FSM_DIR, 'state.map'))
        stateGenerator.osymMap.writeToFile(os.path.join(FSM_DIR, 'osym.map'))
        for i, map in enumerate(stateGenerator.isymMaps):
            map.writeToFile(os.path.join(FSM_DIR, 'isym%d.map' % (i + 1, )))
        stateGenerator.ipteMap.writeToFile(os.path.join(FSM_DIR, 'pte.map'))

        self.fsmcompile()
# ---- Example #12 ----
 def createPTESymbolMap(self, pteSymbols):
     """Map each pass-through-empty symbol to its position index (from 0)."""
     mapping = SymMap()
     index = 0
     for symbol in pteSymbols:
         mapping[symbol] = index
         index += 1
     return mapping
 def setupMaps(self, conceptMap, symMap):
     """Load the concept and symbol map files and store their inverses."""
     fmt = (int, unicode)
     self.conceptMap = SymMap.readFromFile(conceptMap, format=fmt).inverse
     self.symMap = SymMap.readFromFile(symMap, format=fmt).inverse
    def makeFMStxt(self, separ, dataLm):
        """Build dacoder.fsm.txt by splicing the per-dialogue-act LMs.

        For each act in *separ* the corresponding LM is converted to FSM
        text and appended with all state numbers shifted by ``add`` so the
        sub-FSMs do not collide; entry arcs from state 0 emit the act's
        operator/user output symbol, and final-state lines are tied back to
        state 0 via the ``_empty_`` input symbol.  The combined text FSM is
        then compiled with FSMCompile.
        """
        isym_map = None
        isym_fn = os.path.join(dataLm, 'dacoder.fsm.isym')
        fsm = os.path.join(dataLm, 'dacoder.fsm.txt')
        fsm_fw = file(fsm, 'w')
        add = 1   # state-number offset for the sub-FSM being spliced in
        da_map = SymMap.readFromFile(
            os.path.join(dataLm, 'dialogue_act.fsm.isym'))
        for da in separ:
            fn = self.mapTXT(dataLm, da)
            fn_lm = self.mapLM(dataLm, da)
            fn_fsm = self.mapFSM(dataLm, da)

            da_num_op = da_map['operator_' + da]
            da_num_us = da_map['user_' + da]

            if isym_map is None:
                # First act: create the shared input symbol map and extend
                # it with the special and per-act symbols.
                self.convertLMtoFSM(fn_lm, isym_fn)
                isym_map = SymMap.readFromFile(isym_fn)
                _empty_ = isym_map.add('_empty_')
                _operator_ = isym_map.add('_operator_')
                _user_ = isym_map.add('_user_')
                for i in separ:
                    isym_map.add('user_%s' % i)
                    isym_map.add('operator_%s' % i)
                isym_map.writeToFile(isym_fn)

            s0 = None
            states = set()
            for line in self.convertLMtoFSM(fn_lm):
                # GAWK hack
                line = line.replace(',', '.')
                splitted = line.split()

                if s0 is None:
                    # Entry arcs from the global start state into this
                    # sub-FSM, emitting the act's output symbols.
                    s0 = int(splitted[0]) + add
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (
                        s0,
                        _operator_,
                        da_num_op,
                    )
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (
                        s0,
                        _user_,
                        da_num_us,
                    )

                if len(splitted) in (1, 2):
                    # Final-state line: tie back to state 0 via _empty_.
                    state_no = int(splitted[0])
                    if len(splitted) == 2:
                        weight = float(splitted[1])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t0\t%d\t0\t%e' % (state_no + add,
                                                           _empty_, weight)
                    states.add(state_no)
                elif len(splitted) in (3, 4):
                    # Transition line: shift both state numbers.
                    state_no_1 = int(splitted[0])
                    state_no_2 = int(splitted[1])
                    isym = int(splitted[2])
                    if len(splitted) == 4:
                        weight = float(splitted[3])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t%d\t%d\t0\t%e' % (
                        state_no_1 + add, state_no_2 + add, isym, weight)
                    states.add(state_no_1)
                    states.add(state_no_2)
                else:
                    raise ValueError("Unknown FSM line: %r" % line)
            add += max(states) + 1
        # Direct arcs mapping each act's input symbol to its output symbol.
        for i in separ:
            for j in ['user', 'operator']:
                da = '%s_%s' % (j, i)
                isym = isym_map[da]
                osym = da_map[da]
                print >> fsm_fw, '0\t0\t%d\t%d\t0' % (isym, osym)
        print >> fsm_fw, '0'
        fsm_fw.close()
        da_map.writeToFile(os.path.join(dataLm, 'dacoder.fsm.osym'))
        FSMCompile('-t', fsm, '-F', os.path.join(dataLm, 'dacoder.fsm'))
    def main(self,
             model_dir,
             encoding=None,
             batch=False,
             omit_leaves=False,
             mlf=False,
             xml_dir=None,
             ref_mlf=None,
             skip_empty=False,
             input_chain=None,
             batch_size=100,
             no_underscores=True,
             force_pdt=False,
             pdt_dir=None):
        """Decode semantics for input utterances using the model in *model_dir*.

        Reads the ``datasets`` file and the per-dataset input symbol maps
        from *model_dir*, builds an input reader (an XML directory or stdin,
        optionally wrapped in a PDT preprocessing step and an input chain),
        parses either one utterance at a time or in batches of *batch_size*,
        prints hypotheses (or a complete MLF when *mlf* is set), and writes
        reference semantics to *ref_mlf* when given.
        """
        # The encoding parameter is ignored; stdout's encoding wins, with a
        # platform-dependent fallback.
        encoding = sys.stdout.encoding
        if encoding is None:
            if os.name == 'nt':
                encoding = 'cp1250'
            else:
                encoding = 'iso-8859-2'

        # One line per dataset; datasets marked "off" get no symbol map.
        datasets_fn = pjoin(model_dir, 'datasets')
        datasets_fr = file(datasets_fn, 'r')
        datasets = []
        isymMaps = []
        for i, line in enumerate(datasets_fr):
            line = line.strip()
            datasets.append(line)
            if line != 'off':
                isymMaps.append(
                    SymMap.readFromFile(
                        pjoin(model_dir, 'isym%d.map' % (i + 1, ))))

        osymMap = SymMap.readFromFile(pjoin(model_dir, 'osym.map'))

        if 'signed' in datasets:
            da_type = 'signed'
        else:
            da_type = 'normalized'

        if not pdt_dir:
            pdt_dir = '/opt/PDT-2.0/tools/machine-annotation'

        if xml_dir:
            reader = input.MultiReader([xml_dir], input.DXMLReader)
            # NOTE(review): this parses as `(force_pdt and 'lemma' in
            # datasets) or ('pos' in datasets)` -- confirm "pos" alone is
            # meant to trigger PDT even without force_pdt.
            if force_pdt and 'lemma' in datasets or 'pos' in datasets:
                if os.name == 'nt':
                    raise RuntimeError(
                        "Datasets 'lemma' and 'pos' are unsupported on Windows"
                    )
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        else:
            reader = input.StdInReader(encoding=encoding, type=da_type)
            if 'lemma' in datasets or 'pos' in datasets:
                if os.name == 'nt':
                    raise RuntimeError(
                        "Datasets 'lemma' and 'pos' are unsupported on Windows"
                    )
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        if input_chain is not None:
            reader = input.InputChain(input_chain, reader)
        generator = input.InputGenerator(reader,
                                         datasets,
                                         datasets[0],
                                         noUnderscores=no_underscores)
        hypMLF = MLF()
        refMLF = MLF()
        if not batch:
            # On-line mode: parse one utterance per call.
            for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
                da_empty = not bool(da_semantics.strip())
                if (da_empty and skip_empty):
                    continue

                refMLF[da_id] = da_semantics + '\n'
                dcd = self.parseLine(model_dir, [da_txts],
                                     isymMaps,
                                     osymMap,
                                     omitLeaves=omit_leaves)
                if dcd:
                    if len(dcd) == 1:
                        hypMLF[da_id] = dcd[0].encode(encoding) + '\n'
                    else:
                        hypMLF[da_id] = ';'.join(dcd).encode(encoding) + '\n'
                else:
                    # NOTE(review): `line` still holds the last dataset name
                    # from the loop above -- verify this fallback is intended.
                    hypMLF[da_id] = line + '\n'
                if not mlf:
                    print hypMLF[da_id],
        else:
            # Batch mode: collect up to batch_size utterances per parse call.
            all_processed = False
            inputs = generator.readInputs()
            while not all_processed:
                da_count = 0
                lines = []
                ids = []
                for da_fn, da_id, da_semantics, da_txts in inputs:
                    da_empty = not bool(da_semantics.strip())
                    if (da_empty and skip_empty):
                        continue

                    refMLF[da_id] = da_semantics + '\n'
                    lines.append(da_txts)
                    ids.append(da_id)
                    da_count += 1
                    if da_count >= batch_size:
                        break
                else:
                    # Generator exhausted without filling the batch.
                    all_processed = True

                dcd = self.parseLine(model_dir,
                                     lines,
                                     isymMaps,
                                     osymMap,
                                     omitLeaves=omit_leaves)
                for da_id, ol in zip(ids, dcd):
                    hypMLF[da_id] = ol.encode(encoding) + '\n'
                    if not mlf:
                        print hypMLF[da_id],
        if mlf:
            s = ''.join(hypMLF.toLines())
            print s

        if ref_mlf:
            refMLF.writeToFile(ref_mlf)
 def setupMaps(self, conceptMap, symMap):
     """Load the concept and symbol map files and store their inverses."""
     self.conceptMap = SymMap.readFromFile(conceptMap, format=(int, unicode)).inverse
     self.symMap = SymMap.readFromFile(symMap, format=(int, unicode)).inverse
# ---- Example #17 ----
 def convertStateMap(self, map):
     """Return a copy of *map* whose keys are rendered via self.strState."""
     converted = SymMap()
     for state, number in map.iteritems():
         converted[self.strState(state)] = number
     return converted
    def makeFMStxt(self, separ, dataLm):
        """Build dacoder.fsm.txt by splicing the per-dialogue-act LMs.

        For each act in *separ* the corresponding LM is converted to FSM
        text and appended with all state numbers shifted by ``add`` so the
        sub-FSMs do not collide; entry arcs from state 0 emit the act's
        operator/user output symbol, and final-state lines are tied back to
        state 0 via the ``_empty_`` input symbol.  The combined text FSM is
        then compiled with FSMCompile.
        """
        isym_map = None
        isym_fn = os.path.join(dataLm, 'dacoder.fsm.isym')
        fsm = os.path.join(dataLm, 'dacoder.fsm.txt')
        fsm_fw = file(fsm, 'w')
        add = 1   # state-number offset for the sub-FSM being spliced in
        da_map = SymMap.readFromFile(os.path.join(dataLm, 'dialogue_act.fsm.isym'))
        for da in separ:
            fn = self.mapTXT(dataLm, da)
            fn_lm = self.mapLM(dataLm, da)
            fn_fsm = self.mapFSM(dataLm, da)

            da_num_op = da_map['operator_'+da]
            da_num_us = da_map['user_'+da]

            if isym_map is None:
                # First act: create the shared input symbol map and extend
                # it with the special and per-act symbols.
                self.convertLMtoFSM(fn_lm, isym_fn)
                isym_map = SymMap.readFromFile(isym_fn)
                _empty_ = isym_map.add('_empty_')
                _operator_ = isym_map.add('_operator_')
                _user_ = isym_map.add('_user_')
                for i in separ:
                    isym_map.add('user_%s' % i)
                    isym_map.add('operator_%s' % i)
                isym_map.writeToFile(isym_fn)

            s0 = None
            states = set()
            for line in self.convertLMtoFSM(fn_lm):
                # GAWK hack
                line = line.replace(',', '.')
                splitted = line.split()

                if s0 is None:
                    # Entry arcs from the global start state into this
                    # sub-FSM, emitting the act's output symbols.
                    s0 = int(splitted[0])+add
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _operator_, da_num_op, )
                    print >> fsm_fw, '0\t%d\t%d\t%d\t0' % (s0, _user_, da_num_us, )

                if len(splitted) in (1, 2):
                    # Final-state line: tie back to state 0 via _empty_.
                    state_no = int(splitted[0])
                    if len(splitted) == 2:
                        weight = float(splitted[1])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t0\t%d\t0\t%e' % (state_no + add, _empty_, weight)
                    states.add(state_no)
                elif len(splitted) in (3, 4):
                    # Transition line: shift both state numbers.
                    state_no_1 = int(splitted[0])
                    state_no_2 = int(splitted[1])
                    isym = int(splitted[2])
                    if len(splitted) == 4:
                        weight = float(splitted[3])
                    else:
                        weight = 0.
                    print >> fsm_fw, '%d\t%d\t%d\t0\t%e' % (state_no_1+add, state_no_2+add, isym, weight)
                    states.add(state_no_1)
                    states.add(state_no_2)
                else:
                    raise ValueError("Unknown FSM line: %r" % line)
            add += max(states)+1
        # Direct arcs mapping each act's input symbol to its output symbol.
        for i in separ:
            for j in ['user', 'operator']:
                da = '%s_%s' % (j, i)
                isym = isym_map[da]
                osym = da_map[da]
                print >> fsm_fw, '0\t0\t%d\t%d\t0' % (isym, osym)
        print >> fsm_fw, '0'
        fsm_fw.close()
        da_map.writeToFile(os.path.join(dataLm, 'dacoder.fsm.osym'))
        FSMCompile('-t', fsm, '-F', os.path.join(dataLm, 'dacoder.fsm'))
# ---- Example #19 ----
    def genStates(self):
        processed = set()
        backoff_stat = ADict(default=set)

        osym_map = SymMap()
        osym_map['epsilon'] = 0

        pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
        push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

        c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
        c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
        c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
        c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']
        c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

        s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
        s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
        s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
        s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
        s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
        s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

        s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
        s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
        s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
        s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
        s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

        s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
        s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
        s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
        s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
        s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

        conceptMap = self.conceptMap
        _EMPTY_ = conceptMap[EMPTY_CONCEPT]
        _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
        allConcepts = sorted(conceptMap.values())

        symbols = []
        maps = []
        maps2 = []
        count = 1

        pte_map = SymMap()
        pte_map2 = SymMap()
        if self.pteMap:
            pte_symbols = sorted(self.pteMap.values())
            for key, value in sorted(self.pteMap.items()):
                pte_map[value] = value + count
                pte_map2[key] = value + count
            count += len(pte_map)
        else:
            pte_symbols = []

        for map in self.symbolMaps:
            if map is None:
                map = {}
            symbols.append(sorted(map.values()))
            new_map = SymMap()
            new_map2 = SymMap()
            for key, value in sorted(map.items()):
                new_map[value] = value + count
                new_map2[key] = value + count
            count += len(new_map)
            maps.append(new_map)
            maps2.append(new_map2)

        s0 = (_EMPTY_, ) * 4
        s0_expanded = False

        cutoff_sym = self.cutoff_sym
        cutoff_trans = self.cutoff_trans
        max_states = self.max_states

        logger = self.logger

        stack = [(0, 0, s0)]
        stack_set = set([s0])

        state_map = SymMap()
        state_map[s0] = 0

        _pop_ = self._pop_
        interim_counter = 0

        n_arcs = 0
        while stack:
            if max_states is None:
                total_states = len(state_map) - interim_counter
            else:
                total_states = max_states
            if logger is not None:
                logger.debug(
                    '   #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d',
                    100. * len(processed) / total_states,
                    total_states - len(processed), total_states, n_arcs)

            c_t_backoff, c_t_dist, c_t = stack.pop(0)
            backoff_stat[c_t_backoff].add(c_t)
            if logger is not None:
                logger.debug('     %.2f: %s, backoff=%d', c_t_dist,
                             self.strState(c_t), c_t_backoff)
            state_c_t = state_map[c_t]
            processed.add(c_t)
            stack_set.remove(c_t)

            ret = []

            pop_pmf = list(pop_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])
            push_pmf = list(push_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])

            for pop in range(0, MAX_POP + 1):
                prob_pop = pop_pmf[pop]

                if prob_pop <= cutoff_trans:
                    continue

                interim_counter += 1
                c_inter = c_t[pop:] + (_EMPTY_, ) * pop
                osym = ')' * pop
                if not osym:
                    osym = 'epsilon'
                ret.append((prob_pop, c_t, (c_t, c_inter), _pop_, osym))

                for push in range(0, MAX_PUSH + 1):
                    prob_push = push_pmf[push]

                    if push == 0:
                        to_push_all = [()]
                    else:
                        to_push_all = cartezian(*[allConcepts] * push)

                    for to_push in to_push_all:
                        c_new = (to_push + c_inter)[:DEPTH]

                        if (c_t == c_new) and not (push == pop == 0):
                            continue

                        if _DUMMY_ in c_new[1:]:
                            continue

                        # Output symbol
                        osym = ''
                        for push_concept in reversed(to_push):
                            osym += conceptMap.inverse[push_concept] + '('
                        if not osym:
                            osym = 'epsilon'

                        # Smoothing
                        backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                        if backoff == 0:
                            c1_pmf = c1_Given_C234[:c_new[1], c_new[2],
                                                   c_new[3]]
                        elif backoff == 1:
                            c1_pmf = c1_Given_C23[:c_new[1], c_new[2]]
                        else:
                            c1_pmf = c1_Given_C2[:c_new[1]]
                        c2_pmf = c2_Given_C[:c_new[2], c_new[3]]

                        if push == 0:
                            prob_new_c = 1.0
                        elif push == 1:
                            prob_new_c = c1_pmf[to_push[0]]
                        elif push == 2:
                            prob_new_c = c1_pmf[to_push[0]] * c2_pmf[
                                to_push[1]]

                        prob_trans = prob_push * prob_new_c
                        # Do cut-off
                        if prob_trans <= cutoff_trans:
                            continue

                        # Smoothing
                        backoff = s1_backoff[c_new[0], c_new[1], c_new[2],
                                             c_new[3]]
                        if backoff == 0:
                            s_pmf = [
                                list(s1_Given_C1234[:c_new[0], c_new[1],
                                                    c_new[2], c_new[3]]),
                                list(s2_Given_C1234[:c_new[0], c_new[1],
                                                    c_new[2], c_new[3]]),
                                list(s3_Given_C1234[:c_new[0], c_new[1],
                                                    c_new[2], c_new[3]])
                            ]
                        elif backoff == 1:
                            s_pmf = [
                                list(s1_Given_C123[:c_new[0], c_new[1],
                                                   c_new[2]]),
                                list(s2_Given_C123[:c_new[0], c_new[1],
                                                   c_new[2]]),
                                list(s3_Given_C123[:c_new[0], c_new[1],
                                                   c_new[2]])
                            ]
                        elif backoff == 2:
                            s_pmf = [
                                list(s1_Given_C12[:c_new[0], c_new[1]]),
                                list(s2_Given_C12[:c_new[0], c_new[1]]),
                                list(s3_Given_C12[:c_new[0], c_new[1]])
                            ]
                        elif backoff == 3:
                            s_pmf = [
                                list(s1_Given_C1[:c_new[0]]),
                                list(s2_Given_C1[:c_new[0]]),
                                list(s3_Given_C1[:c_new[0]])
                            ]
                        else:
                            s_pmf = [
                                list(s1_Unigram),
                                list(s2_Unigram),
                                list(s3_Unigram)
                            ]

                        if c_new not in processed and c_new not in stack_set:
                            stack_set.add(c_new)
                            c_new_dist = (c_t_dist - log(prob_trans))
                            insort(
                                stack,
                                (backoff, c_t_dist - log(prob_trans), c_new))

                        c_next = (c_t, c_inter)

                        if pte_symbols and c_inter == (
                                _EMPTY_, ) * 4 and push != 0:
                            for pte_sym in pte_symbols:
                                prob_ptesym = 1.0
                                pte_sym = pte_map[pte_sym]
                                pte_osym = pte_map2.inverse[pte_sym]
                                ret.append((prob_trans * prob_ptesym, c_next,
                                            c_new, pte_sym, pte_osym))
                            prob_trans = 1.0
                            c_next = c_new

                        for sym, map, pmf in zip(symbols, maps, s_pmf):
                            if map is None:
                                continue

                            for isym in sym:
                                prob_isym = pmf[isym]

                                # Do cut-off
                                if prob_isym <= cutoff_sym:
                                    continue
                                else:
                                    isym = map[isym]
                                    ret.append((prob_trans * prob_isym, c_next,
                                                c_new, isym, osym))

                            # For symbols other than the first
                            prob_trans = 1.0
                            c_next = c_new
                            osym = 'epsilon'

            for prob, c_t, c_new, isym, osym in ret:
                state_c_new = state_map.add(c_new)
                state_c_t = state_map.add(c_t)

                osym = osym_map.add(osym)

                n_arcs += 1

                yield state_c_t, state_c_new, isym, osym, prob

            if max_states is not None and len(processed) >= max_states:
                break

        self.stateMap = self.convertStateMap(state_map)
        self.osymMap = osym_map
        self.isymMaps = maps2
        self.ipteMap = pte_map2

        backoff_stat = ADict(
            (k, len(v)) for (k, v) in backoff_stat.iteritems())

        if logger is not None:
            logger.debug('Backoff statistics:')
            logger.debug('===================')
            total = backoff_stat.sum()
            for key, value in sorted(backoff_stat.items()):
                logger.debug('  backoff=%d: #%d (%.2f%%)', key, value,
                             100. * value / total)
    def fsmconvert(self, pteMapFn=None):
        """Convert the trained GMTK model into textual FSM files.

        Loads the concept map and the per-dataset symbol maps according to
        ``self.settings``, builds an ``fsm.FSMGenerator`` over the GMTK
        workspace, writes the FSM variants and all symbol/state maps into
        ``FSM_DIR`` and finally invokes :meth:`fsmcompile`.

        :param pteMapFn: optional file name of a PTE symbol map; ``None``
            disables PTE symbols.
        """
        sys.path.append('src')
        import fsm
        from svc.ui import gmtk

        # Generation limits and probability cut-offs from the settings
        max_states = int(self.settings['FSM_STATES'])
        cutoff_sym = float(self.settings['FSM_CUTOFF_SYM'])
        cutoff_trans = float(self.settings['FSM_CUTOFF_TRANS'])

        self.setCommonParams()
        FSM_DIR = self.settings['FSM_DIR']
        mkdirp(FSM_DIR)

        conceptMapFn = self.settings['CONCEPT_MAP']
        self.logger.debug("Reading concept map: %s", conceptMapFn)
        conceptMap = SymMap.readFromFile(conceptMapFn, format=(int, unicode)).inverse
        # The artificial sink concept must not appear in the generated FSM
        del conceptMap['_SINK_']

        # Record which datasets are active and load their symbol maps;
        # a dataset switched 'off' is represented by None in sMaps.
        dataset_fn = os.path.join(FSM_DIR, 'datasets')
        sMaps = []
        # FIX: context manager guarantees the datasets file is closed even
        # if reading one of the symbol maps raises.
        with file(dataset_fn, 'w') as dataset_fw:
            for ds in [1, 2, 3]:
                ds_value = self.settings['S%d_DATASET' % ds]
                if ds_value != 'off':
                    mapFn = self.settings['S%d_MAP' % ds]
                    self.logger.debug("Reading s%d map: %s", ds, mapFn)
                    map = SymMap.readFromFile(mapFn, format=(int, unicode)).inverse
                    sMaps.append(map)
                else:
                    self.logger.debug("Dataset s%d is turned off", ds)
                    sMaps.append(None)
                dataset_fw.write(ds_value + '\n')

        if pteMapFn is not None:
            self.logger.debug("Reading pte map: %s", pteMapFn)
            pteMap = SymMap.readFromFile(pteMapFn, format=(unicode, int))
        else:
            pteMap = {}
        pteSymbols = pteMap.keys()

        # Read the decoding master file into a fresh GMTK workspace
        mstr = os.path.join(self.settings['MSTR_DCD_DIR'], 'in.mstr')
        cppOptions = self.settings['CPP_OPTIONS'].split()
        workspace = gmtk.Workspace(cppOptions=cppOptions, readDTS=False)
        self.logger.info('Reading master file: %s', mstr)
        workspace.readMasterFile(mstr)

        self.logger.info('Creating FSM from arcs')

        self.logger.info('Total number of concepts: %d', len(conceptMap))

        stateGenerator = fsm.FSMGenerator(workspace, conceptMap, sMaps,
                                    cutoff_sym, cutoff_trans, max_states,
                                    pteSymbols=pteSymbols,
                                    logger=self.logger)
        # Write the textual FSM variants ...
        stateGenerator.writeFSMRepeater(os.path.join(FSM_DIR, 'hvsrepeater.txt'))
        stateGenerator.writeFSMPadder(os.path.join(FSM_DIR, 'hvspadder.txt'))
        stateGenerator.writeFSM(os.path.join(FSM_DIR, 'hvsparser_pad.txt'))

        # ... and the symbol/state maps produced during generation
        stateGenerator.stateMap.writeToFile(os.path.join(FSM_DIR, 'state.map'))
        stateGenerator.osymMap.writeToFile(os.path.join(FSM_DIR, 'osym.map'))
        for i, map in enumerate(stateGenerator.isymMaps):
            map.writeToFile(os.path.join(FSM_DIR, 'isym%d.map' % (i+1, )))
        stateGenerator.ipteMap.writeToFile(os.path.join(FSM_DIR, 'pte.map'))

        self.fsmcompile()
 def _readMaps(cls, fsm_fn, encoding='utf-8'):
     """Load the input-symbol table stored beside *fsm_fn*.

     The ``.isym`` file shares the FSM file's base name.
     """
     base = os.path.splitext(fsm_fn)[0]
     return SymMap.readFromFile(base + '.isym', encoding=encoding)
    def genStates(self):
        """Generate parser-FSM arcs from the trained GMTK model.

        A generator yielding arcs ``(src_state, dst_state, isym, osym,
        prob)``.  States are four-tuples of concept ids (a bounded concept
        stack, depth DEPTH); the state space is expanded best-first,
        ordered by (backoff level, path cost), with probability cut-offs
        applied to both transitions and input symbols.

        Side effects once exhausted: fills ``self.stateMap``,
        ``self.osymMap``, ``self.isymMaps`` and ``self.ipteMap``.
        """
        # States that have already been fully expanded
        processed = set()
        # backoff level -> set of states expanded at that level (statistics)
        backoff_stat = ADict(default=set)

        # Output-symbol table; 'epsilon' is reserved as symbol 0
        osym_map = SymMap()
        osym_map['epsilon'] = 0

        # --- Probability tables / decision trees from the GMTK workspace ---
        # Stack operations conditioned on the current concept stack
        pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
        push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

        # Pushed-concept distributions and their backoff decision tree
        c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
        c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
        c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
        c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']
        c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

        # Symbol emission for dataset s1: full context down to unigram
        s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
        s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
        s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
        s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
        s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
        s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

        # Same backoff chain for dataset s2
        s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
        s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
        s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
        s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
        s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

        # Same backoff chain for dataset s3
        s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
        s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
        s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
        s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
        s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

        conceptMap = self.conceptMap
        _EMPTY_ = conceptMap[EMPTY_CONCEPT]
        # NOTE(review): _DUMMY_ is None when DUMMY_CONCEPT is absent; the
        # 'in c_new[1:]' test below then compares against None — confirm
        # that is the intended no-op.
        _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
        allConcepts = sorted(conceptMap.values())

        # Renumber all input symbols into one disjoint global id space;
        # id 0 stays free (epsilon), global ids start at count == 1.
        symbols = []
        maps = []   # per dataset: original symbol id -> global id
        maps2 = []  # per dataset: symbol string -> global id
        count = 1

        pte_map = SymMap()   # original pte id -> global id
        pte_map2 = SymMap()  # pte string -> global id
        if self.pteMap:
            pte_symbols = sorted(self.pteMap.values())
            for key, value in sorted(self.pteMap.items()):
                pte_map[value] = value+count
                pte_map2[key] = value+count
            count += len(pte_map)
        else:
            pte_symbols = []

        for map in self.symbolMaps:
            if map is None:
                map = {}
            symbols.append(sorted(map.values()))
            new_map = SymMap()
            new_map2 = SymMap()
            for key, value in sorted(map.items()):
                new_map[value] = value+count
                new_map2[key] = value+count
            count += len(new_map)
            maps.append(new_map)
            maps2.append(new_map2)

        # Initial state: a stack of four empty concepts
        s0 = (_EMPTY_,)*4
        s0_expanded = False

        cutoff_sym = self.cutoff_sym
        cutoff_trans = self.cutoff_trans
        max_states = self.max_states

        logger = self.logger

        # Best-first queue of (backoff_level, path_cost, state), kept sorted
        # via insort below; stack_set mirrors it for O(1) membership tests.
        stack = [(0, 0, s0)]
        stack_set = set([s0])

        state_map = SymMap()
        state_map[s0] = 0

        # Reserved input-symbol id emitted on pop transitions
        _pop_ = self._pop_
        # Counts interim states (c_t, c_inter) added to state_map
        interim_counter = 0

        n_arcs = 0
        while stack:
            # Progress estimate for logging only
            if max_states is None:
                total_states = len(state_map) - interim_counter
            else:
                total_states = max_states
            if logger is not None:
                logger.debug('   #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d', 100.*len(processed)/total_states, total_states-len(processed), total_states, n_arcs)

            # Expand the cheapest queued state
            c_t_backoff, c_t_dist, c_t = stack.pop(0)
            backoff_stat[c_t_backoff].add(c_t)
            if logger is not None:
                logger.debug('     %.2f: %s, backoff=%d', c_t_dist, self.strState(c_t), c_t_backoff)
            state_c_t = state_map[c_t]
            processed.add(c_t)
            stack_set.remove(c_t)

            # Arcs generated while expanding c_t: (prob, src, dst, isym, osym)
            ret = []

            pop_pmf = list(pop_Given_C[: c_t[0], c_t[1], c_t[2], c_t[3]])
            push_pmf = list(push_Given_C[: c_t[0], c_t[1], c_t[2], c_t[3]])

            # Enumerate how many concepts to pop off the stack
            for pop in range(0, MAX_POP+1):
                prob_pop = pop_pmf[pop]

                if prob_pop <= cutoff_trans:
                    continue

                # Interim state after popping, before any push; the stack is
                # refilled with empty concepts from the bottom.
                interim_counter += 1
                c_inter = c_t[pop:] + (_EMPTY_, ) * pop
                # One closing bracket per popped concept
                osym = ')'*pop
                if not osym:
                    osym = 'epsilon'
                ret.append( (prob_pop, c_t, (c_t, c_inter), _pop_, osym) )

                # Enumerate how many (and which) concepts to push
                for push in range(0, MAX_PUSH+1):
                    prob_push = push_pmf[push]

                    if push == 0:
                        to_push_all = [()]
                    else:
                        to_push_all = cartezian(*[allConcepts]*push)

                    for to_push in to_push_all:
                        c_new = (to_push + c_inter)[:DEPTH]

                        # Skip self-loops unless nothing was popped or pushed
                        if (c_t == c_new) and not (push == pop == 0):
                            continue

                        # The dummy concept may only occupy the stack top
                        if _DUMMY_ in c_new[1:]:
                            continue

                        # Output symbol
                        osym = ''
                        for push_concept in reversed(to_push):
                            osym += conceptMap.inverse[push_concept]+'('
                        if not osym:
                            osym = 'epsilon'

                        # Smoothing
                        backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                        if backoff == 0:
                            c1_pmf = c1_Given_C234[: c_new[1], c_new[2], c_new[3]]
                        elif backoff == 1:
                            c1_pmf = c1_Given_C23[: c_new[1], c_new[2]]
                        else:
                            c1_pmf = c1_Given_C2[: c_new[1]]
                        c2_pmf = c2_Given_C[: c_new[2], c_new[3]]

                        # NOTE(review): prob_new_c is only assigned for
                        # push <= 2; presumably MAX_PUSH == 2 — confirm.
                        if push == 0:
                            prob_new_c = 1.0
                        elif push == 1:
                            prob_new_c = c1_pmf[to_push[0]]
                        elif push == 2:
                            prob_new_c = c1_pmf[to_push[0]] * c2_pmf[to_push[1]]

                        prob_trans = prob_push * prob_new_c
                        # Do cut-off
                        if prob_trans <= cutoff_trans:
                            continue

                        # Smoothing: pick the symbol PMFs for all three
                        # datasets at the backoff level chosen by the DT
                        backoff = s1_backoff[c_new[0], c_new[1], c_new[2], c_new[3]]
                        if backoff == 0:
                            s_pmf = [list(s1_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]]),
                                     list(s2_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]]),
                                     list(s3_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]])]
                        elif backoff == 1:
                            s_pmf = [list(s1_Given_C123[: c_new[0], c_new[1], c_new[2]]),
                                     list(s2_Given_C123[: c_new[0], c_new[1], c_new[2]]),
                                     list(s3_Given_C123[: c_new[0], c_new[1], c_new[2]])]
                        elif backoff == 2:
                            s_pmf = [list(s1_Given_C12[: c_new[0], c_new[1]]),
                                     list(s2_Given_C12[: c_new[0], c_new[1]]),
                                     list(s3_Given_C12[: c_new[0], c_new[1]])]
                        elif backoff == 3:
                            s_pmf = [list(s1_Given_C1[: c_new[0]]),
                                     list(s2_Given_C1[: c_new[0]]),
                                     list(s3_Given_C1[: c_new[0]])]
                        else:
                            s_pmf = [list(s1_Unigram),
                                     list(s2_Unigram),
                                     list(s3_Unigram)]

                        # Queue the successor, ordered by (backoff, path cost)
                        if c_new not in processed and c_new not in stack_set:
                            stack_set.add(c_new)
                            # NOTE(review): c_new_dist is computed but unused
                            c_new_dist = (c_t_dist-log(prob_trans))
                            insort(stack, (backoff, c_t_dist-log(prob_trans), c_new))

                        c_next = (c_t, c_inter)

                        # When pushing onto an empty stack, first emit arcs
                        # for the PTE symbols (if any); subsequent arcs then
                        # start from c_new with the probability already spent.
                        if pte_symbols and c_inter == (_EMPTY_,)*4 and push != 0:
                            for pte_sym in pte_symbols:
                                prob_ptesym = 1.0
                                pte_sym = pte_map[pte_sym]
                                pte_osym = pte_map2.inverse[pte_sym]
                                ret.append( (prob_trans*prob_ptesym, c_next, c_new, pte_sym, pte_osym) )
                            prob_trans = 1.0
                            c_next = c_new

                        # One arc per surviving input symbol of each dataset
                        for sym, map, pmf in zip(symbols, maps, s_pmf):
                            if map is None:
                                continue

                            for isym in sym:
                                prob_isym = pmf[isym]

                                # Do cut-off
                                if prob_isym <= cutoff_sym:
                                    continue
                                else:
                                    isym = map[isym]
                                    ret.append( (prob_trans*prob_isym, c_next, c_new, isym, osym) )

                            # For symbols other than the first
                            prob_trans = 1.0
                            c_next = c_new
                            osym = 'epsilon'

            # Intern states and output symbols, then yield the collected arcs
            for prob, c_t, c_new, isym, osym in ret:
                state_c_new = state_map.add(c_new)
                state_c_t = state_map.add(c_t)

                osym = osym_map.add(osym)

                n_arcs += 1

                yield state_c_t, state_c_new, isym, osym, prob

            if max_states is not None and len(processed) >= max_states:
                break

        # Publish the maps produced during generation
        self.stateMap = self.convertStateMap(state_map)
        self.osymMap = osym_map
        self.isymMaps = maps2
        self.ipteMap = pte_map2

        # Collapse the per-level state sets into counts for reporting
        backoff_stat = ADict((k, len(v)) for (k,v) in backoff_stat.iteritems())

        if logger is not None:
            logger.debug('Backoff statistics:')
            logger.debug('===================')
            total = backoff_stat.sum()
            for key, value in sorted(backoff_stat.items()):
                logger.debug('  backoff=%d: #%d (%.2f%%)', key, value, 100.*value/total)
 def _emptyMaps(cls):
     """Return a fresh symbol map containing only the epsilon symbol (id 0)."""
     empty = SymMap()
     empty[cls.eps] = 0
     return empty
예제 #24
0
 def loadMaps(self, maps):
     """Read every symbol map named in *maps* from ``self.fsm_dir``.

     Returns the loaded maps as a list, in input order.
     """
     return [SymMap.readFromFile(pjoin(self.fsm_dir, name)) for name in maps]
 def _readMaps(cls, fsm_fn, encoding='utf-8'):
     """Load the ``.isym`` input-symbol map that accompanies *fsm_fn*."""
     stem, _ext = os.path.splitext(fsm_fn)
     return SymMap.readFromFile(stem + '.isym', encoding=encoding)