def __init__(self, treebank_name=None, file_handler=None, transformer="vg",
             parser="malt", outdir=None, dep_style="ud", pos_style='ud'):
    """Set up working paths, the parser and the transformation settings.

    Args:
        treebank_name: Name of the treebank. When ``outdir`` is not given,
            it is appended (plus ``"/"``) to ``config.exp`` to build the
            working directory. It is also passed to the parser as ``name``.
        file_handler: Object used to read and write dependency graphs.
            When omitted, a new ``ConllFileHandler()`` is created for this
            instance.
        transformer: Name of the transformation, stored as ``_transformer``.
        parser: Name of the parser backend; only ``"malt"`` is accepted.
        outdir: Working directory prefix. File names are concatenated to it
            directly, so it has to end with a path separator.
        dep_style: Dependency style, stored as ``_dep_style``.
        pos_style: Part-of-speech style, stored as ``_pos_style``.

    Raises:
        ValueError: If ``parser`` is not ``"malt"``.
    """
    # An explicit outdir wins; otherwise fall back to the experiment
    # directory, optionally extended with a per-treebank subdirectory.
    if outdir:
        self.outdir = outdir
    elif treebank_name:
        self.outdir = config.exp + treebank_name + "/"
    else:
        self.outdir = config.exp

    if parser != "malt":
        raise ValueError("Invalid parser")
    self._parser = MaltParser(name=treebank_name)

    # Build the default handler per instance rather than in the signature,
    # where it would be created once at definition time and shared.
    if file_handler is None:
        file_handler = ConllFileHandler()

    self.treebank_name = treebank_name
    self._file_handler = file_handler
    self._transformer = transformer
    self.trainfile = "%strain.conll" % self.outdir
    self.testfile = "%stest_gold.conll" % self.outdir
    self._dep_style = dep_style
    self._pos_style = pos_style
class TreebankTransformer(object):
    """Drive transformation, parsing and statistics for one treebank.

    The class ties together a file handler (reads/writes dependency graphs),
    a parser and a graph transformer, and keeps track of the file names used
    inside the working directory ``outdir``.
    """

    def __init__(self, treebank_name=None, file_handler=None,
                 transformer="vg", parser="malt", outdir=None,
                 dep_style="ud", pos_style='ud'):
        """Set up working paths, the parser and the transformation settings.

        Args:
            treebank_name: Name of the treebank. When ``outdir`` is not
                given, it is appended (plus ``"/"``) to ``config.exp`` to
                build the working directory. It is also passed to the
                parser as ``name``.
            file_handler: Object used to read and write dependency graphs.
                When omitted, a new ``ConllFileHandler()`` is created for
                this instance.
            transformer: Name of the transformation; ``transform`` only
                accepts ``"vg"``.
            parser: Name of the parser backend; only ``"malt"`` is accepted.
            outdir: Working directory prefix. File names are concatenated
                to it directly, so it has to end with a path separator.
            dep_style: Dependency style handed to the transformer.
            pos_style: Part-of-speech style handed to the transformer.

        Raises:
            ValueError: If ``parser`` is not ``"malt"``.
        """
        # An explicit outdir wins; otherwise fall back to the experiment
        # directory, optionally extended with a per-treebank subdirectory.
        if outdir:
            self.outdir = outdir
        elif treebank_name:
            self.outdir = config.exp + treebank_name + "/"
        else:
            self.outdir = config.exp

        if parser != "malt":
            raise ValueError("Invalid parser")
        self._parser = MaltParser(name=treebank_name)

        # Build the default handler per instance rather than in the
        # signature, where it would be created once and shared.
        if file_handler is None:
            file_handler = ConllFileHandler()

        self.treebank_name = treebank_name
        self._file_handler = file_handler
        self._transformer = transformer
        self.trainfile = "%strain.conll" % self.outdir
        self.testfile = "%stest_gold.conll" % self.outdir
        self._dep_style = dep_style
        self._pos_style = pos_style

    def init_files_for_transformation(self):
        """Set the output file names used by ``transform_parse_detransform``."""
        self.parsed_ms = "%sdev_parsed.ms.conll" % self.outdir
        self.parsed_ud = "%sdev_parsed.ud.conll" % self.outdir
        self.transformed_train = "%strain.ms.conll" % self.outdir

    def transform_parse_detransform(self):
        """Transform the training file, train, parse the test file, detransform.

        The parser is trained on the transformed training file, the test
        file is parsed into ``parsed_ms`` and that output is detransformed
        into ``parsed_ud``.
        """
        self.init_files_for_transformation()
        self.transform(self.trainfile, self.transformed_train, "transform")
        self._parser.train(self.transformed_train)
        self._parser.parse(self.testfile, self.parsed_ms)
        self.transform(self.parsed_ms, self.parsed_ud, "detransform")

    def transform_detransform_trainfile(self):
        """Transform the training file and transform the result back.

        Writes the transformed file to ``transformed_train`` and the
        round-tripped file to ``back_transf``.
        """
        self.transformed_train = "%strain.ms.conll" % self.outdir
        self.transform(self.trainfile, self.transformed_train, "transform")
        self.back_transf = "%strain_backtransf.conll" % self.outdir
        self.transform(self.transformed_train, self.back_transf, "detransform")

    def count_aux(self, infile):
        """Return n of aux, n of tokens and n of sentences in ``infile``.

        Each graph is run through ``VGtransformer.transform()`` because the
        auxiliary count is read from the transformer's ``tot_aux`` afterwards.

        Returns:
            Tuple ``(n_aux, n_tokens, n_sentences)`` where ``n_tokens`` is the
            sum of ``len(dg)`` over all graphs.
        """
        n_aux = 0
        n_tokens = 0
        dgs_in = self._file_handler.file_to_dg_list(infile)
        for dg in dgs_in:
            n_tokens += len(dg)
            vg = VGtransformer(dg, dep_style=self._dep_style)
            vg.transform()
            n_aux += vg.tot_aux
        return n_aux, n_tokens, len(dgs_in)

    def count_cop(self, infile):
        """Return n of sentences with a cop deprel and n of sentences.

        Returns:
            Tuple ``(n_cop, n_sentences)`` where ``n_cop`` counts the graphs
            for which ``has_cop_deprel()`` is true.
        """
        dgs_in = self._file_handler.file_to_dg_list(infile)
        n_cop = sum(1 for dg in dgs_in if dg.has_cop_deprel())
        return n_cop, len(dgs_in)

    def collect_vg_postags(self, infile):
        """Collect POS-tag statistics for main verbs and auxiliaries.

        Every graph adds its information to two shared dictionaries through
        ``VGtransformer.add_vg_pos_information``; both dictionaries are then
        passed through ``dict_count_to_freq``.

        Returns:
            Tuple ``(main_verbs_pos, aux_pos)``.
        """
        aux_pos = {}
        main_verbs_pos = {}
        dgs_in = self._file_handler.file_to_dg_list(infile)
        for dg in dgs_in:
            vg = VGtransformer(dg, dep_style=self._dep_style)
            vg.add_vg_pos_information(aux_pos, main_verbs_pos)
        main_verbs_pos = dict_count_to_freq(main_verbs_pos)
        aux_pos = dict_count_to_freq(aux_pos)
        return main_verbs_pos, aux_pos

    def transform(self, infile, outfile, transformation):
        """Apply ``transformation`` to every graph of ``infile``.

        The graphs read from ``infile`` are processed one by one and the same
        graph objects are then written to ``outfile``.

        Args:
            infile: File to read the dependency graphs from.
            outfile: File the processed graphs are written to.
            transformation: One of ``"transform"``, ``"detransform"``,
                ``"disambig"``, ``"ambig"`` or ``"to_conllx"``.

        Raises:
            ValueError: If the configured transformer is not ``"vg"`` or
                ``transformation`` is not one of the names above. Both checks
                happen per graph, so nothing is raised for an empty input,
                and nothing is written when an error is raised.
        """
        dgs_in = self._file_handler.file_to_dg_list(infile)
        dgs_out = []
        for dg in dgs_in:
            if self._transformer != "vg":
                raise ValueError("Invalid transformer")
            vg = VGtransformer(dg, dep_style=self._dep_style,
                               pos_style=self._pos_style)

            if transformation == "transform":
                vg.transform()
            elif transformation == "detransform":
                vg.detransform()
            elif transformation == "disambig":
                vg.disambiguate_vg_postags()
            elif transformation == "ambig":
                dg.make_verbs_ambiguous(pos_style=self._pos_style)
            elif transformation == "to_conllx":
                dg.to_conllx()
            else:
                raise ValueError("Invalid transformation")
            dgs_out.append(dg)
        self._file_handler.dep_graphs_to_file(outfile, dgs_out)