def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False,
             eval_spec_rel=False, logger=None, ignore_rel=None, lr=False, use_muc=False):
    """Load a trained model from *path* and decode HMM-based word
    representations onto the NER data of language *lang*.

    :param path: experiment directory; ``read_params_from_path(path)``
        supplies model sizes, the corpus file and the ``omit_*`` flags
    :param lang: "nl" or "en"; any other value exits via ``sys.exit``
    :param decoding: decoding strategy name; when None a default is
        chosen ("max-product" for tree/rel models, "viterbi" otherwise)
    :param use_wordrep_tree: use tree-structured representations
    :param use_wordrep_rel: use relation-based tree representations
    :param eval_spec_rel: forwarded to the corpus readers
    :param logger: optional logger for debug messages
    :param ignore_rel: relation name; mapped to its id via the corpus
        ``r_dict`` and ignored during tree decoding (None keeps all)
    :param lr: flag forwarded unchanged to the corpus readers and the
        tree preparation — presumably a directionality option; confirm
    :param use_muc: also decode the MUC test set (English only)
    """
    self.path = path
    self.lang = lang
    self.decoding = decoding
    self.use_wordrep_tree = use_wordrep_tree
    self.use_wordrep_rel = use_wordrep_rel
    self.eval_spec_rel = eval_spec_rel
    self.use_muc = use_muc
    # Pick a sensible default decoder for the model family.
    if self.decoding is None:
        print("Decoding method not specified.")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            self.decoding = "max-product"
        else:
            self.decoding = "viterbi"
        print("Using default: {}".format(self.decoding))
    self.n_states = None
    self.n_obs = None
    self.n_sent = None
    self.n_toks = None
    self.corpus_file = None
    self.logger = logger
    # Recover model/corpus metadata recorded by the training run.
    self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
        read_params_from_path(self.path)
    if self.logger is not None:
        self.logger.debug("Preparing self.dataset")
    if self.use_wordrep_tree or self.use_wordrep_rel:
        # English corpora are read with word forms, all others with lemmas.
        lemmas = False if self.lang == "en" else True
        self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                   eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
        self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
        if decoding == "posterior_cont_type":
            self.dataset.train = self.dataset.prepare_trees_gen()  # generator
    else:
        self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
        if decoding == "posterior_cont_type":
            self.dataset.prepare_chains()
    self.ner_corpus = None
    # Dispatch on language x representation type. Note the Dutch paths
    # return 3 sequence sets while the English paths also return MUC.
    if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
    elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
    elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
    elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
    else:
        sys.exit("invalid option in PrepareHmmRep")
    # Release the (potentially large) corpus once decoding is done.
    self.dataset = None
def main(self, path):
    """Load experiment artifacts from *path* and set up the dataset.

    Reads the emission probabilities (``ep.npy``) and recovers the
    corpus name and sentence count from the experiment's ``settings``
    file, then instantiates the matching corpus reader.

    :param path: path to dir containing npy and settings files from the
        experiment
    """
    self.path = path
    self.ep = np.load("{}/ep.npy".format(self.path))
    # Get some info from the settings file.
    with open("{}/settings".format(self.path)) as infile:
        data_name = None
        n_sent = None
        for l in infile:
            if l.startswith("Name of the corpus file: "):
                data_name = l.strip().split(" ")[-1]
            elif l.startswith("Number of sentences: "):
                n_sent = l.strip().split(" ")[-1]
        if data_name is None:
            print("Not able to retrieve the dataset name.")
        if n_sent is None:
            print("Not able to retrieve the number of sentences.")
    self.data_name = data_name
    # int(float(...)) instead of eval(): accepts "10000" as well as
    # "1e4" without executing arbitrary text read from the settings file.
    self.n_sent = int(float(n_sent))
    # BUGFIX: the original test was `if "tree" or "_rel_" in path:`,
    # which is always true because the non-empty literal "tree" is
    # truthy. Test both substrings explicitly and keep the plain text
    # corpus as the fallback for non-tree experiments.
    if "tree" in path or "_rel_" in path:
        if "_en_" in path:
            self.data = ConllCorpus(self.data_name, howbig=self.n_sent, lemmas=False)
        elif "_nl_" in path:
            self.data = ConllCorpus(self.data_name, howbig=self.n_sent)
        else:
            self.data = TextCorpus(self.data_name, howbig=self.n_sent)
    else:
        self.data = TextCorpus(self.data_name, howbig=self.n_sent)
    self.prob_thresh = None
    self.n = None  # max n of clusters per w
def __init__(self, path, lang, decoding=None, use_wordrep_tree=False):
    """Debug variant: load a trained model from *path* and decode word
    representations, mostly on the smaller dev/test splits (the *_dbg
    preparation methods).

    :param path: experiment directory; ``self.read_params_from_path()``
        supplies model sizes and the corpus file
    :param lang: "nl" or "en" — other values leave the sequence
        attributes unset (no else branch below)
    :param decoding: decoding strategy; defaults to "max-product" for
        tree models and "viterbi" otherwise
    :param use_wordrep_tree: use tree-structured representations
    """
    self.path = path
    self.lang = lang
    self.decoding = decoding
    self.use_wordrep_tree = use_wordrep_tree
    # Pick a sensible default decoder for the model family.
    if self.decoding is None:
        print("Decoding method not specified.")
        if self.use_wordrep_tree:
            self.decoding = "max-product"
        else:
            self.decoding = "viterbi"
        print("Using default: {}".format(self.decoding))
    self.n_states = None
    self.n_obs = None
    self.n_sent = None
    self.n_toks = None
    self.corpus_file = None
    # Recover model/corpus metadata recorded by the training run.
    self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file = \
        self.read_params_from_path()
    if self.use_wordrep_tree:
        # NOTE(review): a tree model with a language other than en/nl
        # leaves self.dataset unset — presumably never happens; confirm.
        if self.lang == "en":
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=False)
        elif self.lang == "nl":
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
    else:
        self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
    self.ner_corpus = None
    # Dispatch on language x representation type; the chain paths use
    # the _dbg readers and skip the training split.
    if self.lang == "nl" and not self.use_wordrep_tree:
        self.dev_seq, self.test_seq = self.prepare_seqs_nl_dbg(self.decoding)
    elif self.lang == "nl" and self.use_wordrep_tree:
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding)
    elif self.lang == "en" and not self.use_wordrep_tree:
        self.dev_seq = self.prepare_seqs_en_dbg(self.decoding)
    elif self.lang == "en" and self.use_wordrep_tree:
        self.dev_seq = self.prepare_trees_en_dbg(self.decoding)
def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False,
             eval_spec_rel=False, logger=None, ignore_rel=None, lr=False, use_muc=False):
    """Load a trained model from *path* and decode HMM-based word
    representations onto the NER data of language *lang*.

    :param path: experiment directory; ``read_params_from_path(path)``
        supplies model sizes, the corpus file and the ``omit_*`` flags
    :param lang: "nl" or "en"; any other value exits via ``sys.exit``
    :param decoding: decoding strategy name; when None a default is
        chosen ("max-product" for tree/rel models, "viterbi" otherwise)
    :param use_wordrep_tree: use tree-structured representations
    :param use_wordrep_rel: use relation-based tree representations
    :param eval_spec_rel: forwarded to the corpus readers
    :param logger: optional logger for debug messages
    :param ignore_rel: relation name; mapped to its id via the corpus
        ``r_dict`` and ignored during tree decoding (None keeps all)
    :param lr: flag forwarded unchanged to the corpus readers and the
        tree preparation — presumably a directionality option; confirm
    :param use_muc: also decode the MUC test set (English only)
    """
    self.path = path
    self.lang = lang
    self.decoding = decoding
    self.use_wordrep_tree = use_wordrep_tree
    self.use_wordrep_rel = use_wordrep_rel
    self.eval_spec_rel = eval_spec_rel
    self.use_muc = use_muc
    # Pick a sensible default decoder for the model family.
    if self.decoding is None:
        print("Decoding method not specified.")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            self.decoding = "max-product"
        else:
            self.decoding = "viterbi"
        print("Using default: {}".format(self.decoding))
    self.n_states = None
    self.n_obs = None
    self.n_sent = None
    self.n_toks = None
    self.corpus_file = None
    self.logger = logger
    # Recover model/corpus metadata recorded by the training run.
    self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
        read_params_from_path(self.path)
    if self.logger is not None:
        self.logger.debug("Preparing self.dataset")
    if self.use_wordrep_tree or self.use_wordrep_rel:
        # English corpora are read with word forms, all others with lemmas.
        lemmas = False if self.lang == "en" else True
        self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                   eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
        self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
        if decoding == "posterior_cont_type":
            self.dataset.train = self.dataset.prepare_trees_gen()  # generator
    else:
        self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
        if decoding == "posterior_cont_type":
            self.dataset.prepare_chains()
    self.ner_corpus = None
    # Dispatch on language x representation type. Note the Dutch paths
    # return 3 sequence sets while the English paths also return MUC.
    if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
    elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
    elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
    elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
    else:
        sys.exit("invalid option in PrepareHmmRep")
    # Release the (potentially large) corpus once decoding is done.
    self.dataset = None
class PrepareHmmRep():
    """
    Applying hmm-based representations to the evaluation dataset.
    This includes decoding.

    Loads saved HMM/HMTM/HMRTM parameters from an experiment directory
    and decorates the NER train/dev/test (and optionally MUC) sequences
    with decoded word representations.
    """

    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False,
                 eval_spec_rel=False, logger=None, ignore_rel=None, lr=False, use_muc=False):
        """Load a trained model from *path* and decode word
        representations onto the NER data of language *lang*.

        :param path: experiment directory; ``read_params_from_path(path)``
            supplies model sizes, the corpus file and the ``omit_*`` flags
        :param lang: "nl" or "en"; any other value exits via ``sys.exit``
        :param decoding: decoding strategy name; when None a default is
            chosen ("max-product" for tree/rel models, "viterbi" otherwise)
        :param use_wordrep_tree: use tree-structured representations
        :param use_wordrep_rel: use relation-based tree representations
        :param eval_spec_rel: forwarded to the corpus readers
        :param logger: optional logger for debug messages
        :param ignore_rel: relation name; mapped to its id via the corpus
            ``r_dict`` and ignored during tree decoding (None keeps all)
        :param lr: flag forwarded unchanged to the corpus readers —
            presumably a directionality option; confirm
        :param use_muc: also decode the MUC test set (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        # Pick a sensible default decoder for the model family.
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None
        self.logger = logger
        # Recover model/corpus metadata recorded by the training run.
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # English corpora are read with word forms, others with lemmas.
            lemmas = False if self.lang == "en" else True
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
            self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen()  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()
        self.ner_corpus = None
        # Dispatch on language x representation type. The Dutch paths
        # return 3 sequence sets, the English paths also return MUC.
        if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
        elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")
        # Release the (potentially large) corpus once decoding is done.
        self.dataset = None

    def prepare_seqs_nl(self, decoding="viterbi"):
        """Decode Dutch (CoNLL-2002) NER sequences with a chain HMM.

        Loads the saved initial/transition/final/emission parameters,
        reads the three splits and decorates them with decoded word
        representations.

        :returns: (train_seq, dev_seq, test_seq)
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))
        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel, dirname=self.path)
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)
        # Select the decoding routine. "posterior_cont_type" uses a
        # type-level decoder that also needs the dataset and logger.
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Viterbi.")
            decoder = h.viterbi_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq)
        return train_seq, dev_seq, test_seq

    def prepare_seqs_en(self, decoding="viterbi"):
        """Decode English (CoNLL-2003) NER sequences with a chain HMM.

        :returns: (train_seq, dev_seq, test_seq, muc_seq) where muc_seq
            is None unless ``self.use_muc`` is set
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))
        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)
        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None
        # Select the decoding routine (see prepare_seqs_nl).
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(muc_seq, self.dataset, self.logger) if type_decoder is not None else decoder(muc_seq)
        return train_seq, dev_seq, test_seq, muc_seq

    def prepare_trees_nl(self, decoding="max-product", lr=False):
        """Decode Dutch NER data with a tree model (HMTM/HMRTM).

        Note: parameter paths here use "{}ip.npy" (no slash), so
        ``self.path`` is expected to end with a separator — confirm.

        :returns: (train_seq, dev_seq, test_seq)
        """
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))
        # Relation-conditioned tree model when requested, plain tree
        # model otherwise.
        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        # h.dirname = self.path
        self.logger.debug("Creating self.ner_corpus")
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path, lr=lr, use_wordrep_tree=True)
        self.logger.debug("Reading ner data from self.ner_corpus")
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train, ned_train_parsed, ned_train_parsed_files_path)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev, ned_dev_parsed, ned_dev_parsed_files_path)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test, ned_test_parsed, ned_test_parsed_files_path)
        # Select the decoding routine; tree decoders additionally take
        # the relation id to ignore.
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        # elif decoding == "max_emission":
        #     decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus
        self.logger.debug("Decoding.")
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq, self.ignore_rel)
        return train_seq, dev_seq, test_seq

    def prepare_trees_en(self, decoding="max-product", lr=False):
        """Decode English NER data with a tree model (HMTM/HMRTM).

        :returns: (train_seq, dev_seq, test_seq, muc_seq) where muc_seq
            is None unless ``self.use_muc`` is set
        """
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))
        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.logger.debug("Reading ner data from self.ner_corpus")
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path, lr=lr, use_wordrep_tree=True)
        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train, eng_train_parsed)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev, eng_dev_parsed)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test, eng_test_parsed)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test, muc_test_parsed) if self.use_muc else None
        # return train_seq, dev_seq, test_seq
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq, self.ignore_rel)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(muc_seq, self.dataset, self.logger) if type_decoder is not None else decoder(muc_seq, self.ignore_rel)
        return train_seq, dev_seq, test_seq, muc_seq
dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=args.rel_spec_nl, dirname=dirname, lr=args.lr) else: dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=None, dirname=dirname, lr=args.lr) n_rels = len(dataset.r_dict) else: dataset = TextCorpus(args.dataset, howbig=n_sent) dataset.prepare_chains() n_obs = len(dataset.x_dict) writeout = args.writeout if args.rel or args.lr: model = HMRTM elif args.tree: model = HMTM else: model = HMM if args.params is not None: params_fixed_path = args.params if args.params_trained:
class PrepareHmmRep():
    """
    Applying hmm-based representations to the evaluation dataset.
    This includes decoding.

    Loads saved HMM/HMTM/HMRTM parameters from an experiment directory
    and decorates the NER train/dev/test (and optionally MUC) sequences
    with decoded word representations.
    """

    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False,
                 eval_spec_rel=False, logger=None, ignore_rel=None, lr=False, use_muc=False):
        """Load a trained model from *path* and decode word
        representations onto the NER data of language *lang*.

        :param path: experiment directory; ``read_params_from_path(path)``
            supplies model sizes, the corpus file and the ``omit_*`` flags
        :param lang: "nl" or "en"; any other value exits via ``sys.exit``
        :param decoding: decoding strategy name; when None a default is
            chosen ("max-product" for tree/rel models, "viterbi" otherwise)
        :param use_wordrep_tree: use tree-structured representations
        :param use_wordrep_rel: use relation-based tree representations
        :param eval_spec_rel: forwarded to the corpus readers
        :param logger: optional logger for debug messages
        :param ignore_rel: relation name; mapped to its id via the corpus
            ``r_dict`` and ignored during tree decoding (None keeps all)
        :param lr: flag forwarded unchanged to the corpus readers —
            presumably a directionality option; confirm
        :param use_muc: also decode the MUC test set (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        # Pick a sensible default decoder for the model family.
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None
        self.logger = logger
        # Recover model/corpus metadata recorded by the training run.
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # English corpora are read with word forms, others with lemmas.
            lemmas = False if self.lang == "en" else True
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
            self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen()  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()
        self.ner_corpus = None
        # Dispatch on language x representation type. The Dutch paths
        # return 3 sequence sets, the English paths also return MUC.
        if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
        elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")
        # Release the (potentially large) corpus once decoding is done.
        self.dataset = None

    def prepare_seqs_nl(self, decoding="viterbi"):
        """Decode Dutch (CoNLL-2002) NER sequences with a chain HMM.

        :returns: (train_seq, dev_seq, test_seq)
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))
        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel, dirname=self.path)
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)
        # Select the decoding routine. "posterior_cont_type" uses a
        # type-level decoder that also needs the dataset and logger.
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Viterbi.")
            decoder = h.viterbi_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq)
        return train_seq, dev_seq, test_seq

    def prepare_seqs_en(self, decoding="viterbi"):
        """Decode English (CoNLL-2003) NER sequences with a chain HMM.

        :returns: (train_seq, dev_seq, test_seq, muc_seq) where muc_seq
            is None unless ``self.use_muc`` is set
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))
        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)
        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None
        # Select the decoding routine (see prepare_seqs_nl).
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(muc_seq, self.dataset, self.logger) if type_decoder is not None else decoder(muc_seq)
        return train_seq, dev_seq, test_seq, muc_seq

    def prepare_trees_nl(self, decoding="max-product", lr=False):
        """Decode Dutch NER data with a tree model (HMTM/HMRTM).

        Note: parameter paths here use "{}ip.npy" (no slash), so
        ``self.path`` is expected to end with a separator — confirm.

        :returns: (train_seq, dev_seq, test_seq)
        """
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))
        # Relation-conditioned tree model when requested, plain tree
        # model otherwise.
        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        # h.dirname = self.path
        self.logger.debug("Creating self.ner_corpus")
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path, lr=lr, use_wordrep_tree=True)
        self.logger.debug("Reading ner data from self.ner_corpus")
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train, ned_train_parsed, ned_train_parsed_files_path)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev, ned_dev_parsed, ned_dev_parsed_files_path)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test, ned_test_parsed, ned_test_parsed_files_path)
        # Select the decoding routine; tree decoders additionally take
        # the relation id to ignore.
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        # elif decoding == "max_emission":
        #     decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus
        self.logger.debug("Decoding.")
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq, self.ignore_rel)
        return train_seq, dev_seq, test_seq

    def prepare_trees_en(self, decoding="max-product", lr=False):
        """Decode English NER data with a tree model (HMTM/HMRTM).

        :returns: (train_seq, dev_seq, test_seq, muc_seq) where muc_seq
            is None unless ``self.use_muc`` is set
        """
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))
        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        self.logger.debug("Reading ner data from self.ner_corpus")
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path, lr=lr, use_wordrep_tree=True)
        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train, eng_train_parsed)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev, eng_dev_parsed)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test, eng_test_parsed)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test, muc_test_parsed) if self.use_muc else None
        # return train_seq, dev_seq, test_seq
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus
        print("Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset, self.logger) if type_decoder is not None else decoder(train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset, self.logger) if type_decoder is not None else decoder(dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset, self.logger) if type_decoder is not None else decoder(test_seq, self.ignore_rel)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(muc_seq, self.dataset, self.logger) if type_decoder is not None else decoder(muc_seq, self.ignore_rel)
        return train_seq, dev_seq, test_seq, muc_seq
def create_vocab(self, dataset):
    """Build and return the word dictionary (x_dict) over *dataset*."""
    # howbig=1e10 effectively means "no limit": read the whole corpus.
    return TextCorpus(dataset, howbig=1e10).x_dict