def __extractIntentionsFile(self, fn, data_dir, out_fn):
    """Convert one concept-level MLF into an intention-level MLF stored next to it."""
    self.logger.info('Making intention level from: %s', fn)
    if not os.path.exists(fn):
        self.logger.warn('File does not exist, skipping: %s', fn)
        return
    mlf = ConceptMLF.readFromFile(fn)
    fw = MLF()
    processor = NadraziProcessor([data_dir])
    # Map every utterance's concept tree to a single intention line.
    for mlf_fn, tree in mlf.iteritems():
        fw[mlf_fn] = ['%s\n' % processor.process(mlf_fn, tree)]
    dirname = os.path.split(fn)[0]
    fw.writeToFile(os.path.join(dirname, out_fn))
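# Usage sketch, not part of the original code: __extractIntentionsFile is
# name-mangled, so it is normally driven by another method of the same class.
# The method name, split names and file names below are hypothetical and only
# illustrate one plausible way to run the conversion over several data splits.
def extractIntentions(self, data_dir, splits=('train', 'heldout', 'test'),
                      out_fn='intention.mlf'):
    # For each split, convert its concept-level MLF into an intention-level one.
    for split in splits:
        fn = os.path.join(data_dir, split, 'concept.mlf')
        self.__extractIntentionsFile(fn, data_dir, out_fn)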
def mlfparametrize(self, mlf, inDir=None, outDir=None, source_type='normalized',
                   target_type='decoded', use_empty=False):
    """Dump the text of every dialogue act (preferring its MLF transcription)
    into a temporary sentence file and run the parametrization on it."""
    self.setupDirs(inDir, outDir)
    fd, new_sen_file = tempfile.mkstemp()
    os.close(fd)  # only the file name is needed; the file is reopened below
    self.logger.info("Reading MLF")
    mlf = MLF.readFromFile(mlf)
    self.logger.info("Making build directories")
    self.makeDirs()
    self.logger.info("Getting text from XMLs")
    if new_sen_file is None:
        fn = os.path.join(self.settings['PDT_WORK_DIR'], 'sen_file')
    else:
        fn = new_sen_file
    fw = codecs.open(fn, 'w', 'utf-8')
    try:
        for fn in self.getInputXMLs():
            dxml = DXML.readFromFile(fn)
            acts = dxml.getDialogueActs(source_type)
            idx = 0
            for utter in acts:
                for txt, attrs in utter:
                    if not txt:
                        self.logger.info("Empty <dialogue_act> in file: %s", fn)
                    key = "*/%s_%05d" % (os.path.splitext(os.path.basename(fn))[0], idx)
                    # Prefer the MLF transcription; otherwise fall back to the
                    # XML text, or to an empty line if use_empty is set.
                    if key in mlf:
                        line = mlf[key]
                    elif not use_empty:
                        line = [txt + '\n']
                    else:
                        line = ['\n']
                    line = '\n'.join(line)
                    fw.write(line)
                    idx += 1
    finally:
        fw.close()
    self.parametrize(target_type, new_sen_file)
    os.remove(new_sen_file)
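# Usage sketch, not part of the original code: `builder` stands for an
# instance of the class that defines mlfparametrize; the MLF file name,
# directory layout and helper name are hypothetical.
def run_mlf_parametrization(builder, mlf_fn='words.mlf',
                            in_dir='data/xml', out_dir='build'):
    # Write a sentence file from the 'normalized' dialogue acts and run the
    # 'decoded' parametrization on it.
    builder.mlfparametrize(mlf_fn, inDir=in_dir, outDir=out_dir,
                           source_type='normalized', target_type='decoded',
                           use_empty=False)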
def main(self, model_dir, encoding=None, batch=False, omit_leaves=False, mlf=False,
         xml_dir=None, ref_mlf=None, skip_empty=False, input_chain=None,
         batch_size=100, no_underscores=True, force_pdt=False, pdt_dir=None):
    # Pick an output encoding: the terminal's if known, otherwise a
    # platform-dependent default.
    encoding = sys.stdout.encoding
    if encoding is None:
        if os.name == 'nt':
            encoding = 'cp1250'
        else:
            encoding = 'iso-8859-2'

    # Read the list of datasets the model was trained on together with the
    # matching input symbol maps and the output symbol map.
    datasets_fn = pjoin(model_dir, 'datasets')
    datasets_fr = open(datasets_fn, 'r')
    datasets = []
    isymMaps = []
    for i, line in enumerate(datasets_fr):
        line = line.strip()
        datasets.append(line)
        if line != 'off':
            isymMaps.append(SymMap.readFromFile(pjoin(model_dir, 'isym%d.map' % (i + 1,))))
    osymMap = SymMap.readFromFile(pjoin(model_dir, 'osym.map'))

    if 'signed' in datasets:
        da_type = 'signed'
    else:
        da_type = 'normalized'

    if not pdt_dir:
        pdt_dir = '/opt/PDT-2.0/tools/machine-annotation'

    # Build the reader chain: a DXML directory or stdin, optionally wrapped by
    # the PDT morphological annotator and a user-supplied input chain.
    if xml_dir:
        reader = input.MultiReader([xml_dir], input.DXMLReader)
        if force_pdt and ('lemma' in datasets or 'pos' in datasets):
            if os.name == 'nt':
                raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
            reader = input.PDTReader(pdt_dir, reader, online=not batch)
    else:
        reader = input.StdInReader(encoding=encoding, type=da_type)
        if 'lemma' in datasets or 'pos' in datasets:
            if os.name == 'nt':
                raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
            reader = input.PDTReader(pdt_dir, reader, online=not batch)
    if input_chain is not None:
        reader = input.InputChain(input_chain, reader)
    generator = input.InputGenerator(reader, datasets, datasets[0], noUnderscores=no_underscores)

    hypMLF = MLF()
    refMLF = MLF()

    if not batch:
        # Online mode: decode each dialogue act as soon as it is read.
        for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
            da_empty = not bool(da_semantics.strip())
            if da_empty and skip_empty:
                continue
            refMLF[da_id] = da_semantics + '\n'
            dcd = self.parseLine(model_dir, [da_txts], isymMaps, osymMap, omitLeaves=omit_leaves)
            if dcd:
                if len(dcd) == 1:
                    hypMLF[da_id] = dcd[0].encode(encoding) + '\n'
                else:
                    hypMLF[da_id] = ';'.join(dcd).encode(encoding) + '\n'
            else:
                hypMLF[da_id] = line + '\n'
            if not mlf:
                print hypMLF[da_id],
    else:
        # Batch mode: collect up to batch_size dialogue acts and decode them
        # with a single call to parseLine.
        all_processed = False
        inputs = generator.readInputs()
        while not all_processed:
            da_count = 0
            lines = []
            ids = []
            for da_fn, da_id, da_semantics, da_txts in inputs:
                da_empty = not bool(da_semantics.strip())
                if da_empty and skip_empty:
                    continue
                refMLF[da_id] = da_semantics + '\n'
                lines.append(da_txts)
                ids.append(da_id)
                da_count += 1
                if da_count >= batch_size:
                    break
            else:
                all_processed = True
            dcd = self.parseLine(model_dir, lines, isymMaps, osymMap, omitLeaves=omit_leaves)
            for da_id, ol in zip(ids, dcd):
                hypMLF[da_id] = ol.encode(encoding) + '\n'
                if not mlf:
                    print hypMLF[da_id],

    if mlf:
        s = ''.join(hypMLF.toLines())
        print s
    if ref_mlf:
        refMLF.writeToFile(ref_mlf)
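# Invocation sketch, not part of the original code: `decoder` stands for an
# instance of the class that defines main(); the helper name and the output
# MLF path are hypothetical. This decodes a directory of DXML files in batch
# mode, prints the hypothesis MLF and writes the reference MLF.
def decode_directory(decoder, model_dir, xml_dir):
    decoder.main(model_dir,
                 batch=True,
                 mlf=True,
                 xml_dir=xml_dir,
                 ref_mlf=os.path.join(model_dir, 'ref.mlf'),
                 skip_empty=True,
                 batch_size=100)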