Example #1
    def main(self, path):
        """

        :param path: path to dir containing npy and settings files from the experiment
        """
        self.path = path
        self.ep = np.load("{}/ep.npy".format(self.path))
        # get some info from the settings file
        with open("{}/settings".format(self.path)) as infile:
            data_name = None
            n_sent = None
            for l in infile:
                if l.startswith("Name of the corpus file: "):
                    data_name = l.strip().split(" ")[-1]
                elif l.startswith("Number of sentences: "):
                    n_sent = l.strip().split(" ")[-1]
            if data_name is None:
                print("Not able to retrieve the dataset name.")
            if n_sent is None:
                print("Not able to retrieve the number of sentences.")

        self.data_name = data_name
        self.n_sent = int(n_sent)
        if "tree" in path or "_rel_" in path:
            if "_en_" in path:
                self.data = ConllCorpus(self.data_name,
                                        howbig=self.n_sent,
                                        lemmas=False)
            elif "_nl_" in path:
                self.data = ConllCorpus(self.data_name, howbig=self.n_sent)
        else:
            self.data = TextCorpus(self.data_name, howbig=self.n_sent)

        self.prob_thresh = None
        self.n = None  # max n of clusters per w
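
A minimal calling sketch for Example #1 (the owner class name HmmRep and the directory are hypothetical; the directory is assumed to contain ep.npy and the settings file described above):

    # Hypothetical: `rep` is an instance of the class that defines main().
    rep = HmmRep()
    rep.main("experiments/hmm_nl_tree_run1")
    print(rep.data_name, rep.n_sent)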
Example #2
 def get_tree(n_inst):
     trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=False, eval_spec_rels=self.eval_spec_rel,
                         dirname=self.dirname, lr=self.lr)
     trees.prepare_trees()
     # extend instances with trees
     assert len(trees.train) == n_inst, "Number of parses not equal to number of classification instances."
     c_append = 0
     for i in range(n_inst):
         # every instance has a parse here (guaranteed by the assert above)
         inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
         c_append += 1
         yield inst
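
A hedged usage sketch for the generator above; `instances` stands in for the classification instances of the enclosing method:

    # Pair each classification instance with its normalized tree.
    paired = [(inst, tree)
              for inst, tree in zip(instances, get_tree(len(instances)))]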
Example #3
    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False):

        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            print("Using default: {}".format(self.decoding))
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file = \
            self.read_params_from_path()

        if self.use_wordrep_tree:
            if self.lang == "en":
                self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                           howbig=self.n_sent,
                                           lemmas=False)
            elif self.lang == "nl":
                self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                           howbig=self.n_sent)
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file),
                                      howbig=self.n_sent)
        self.ner_corpus = None

        if self.lang == "nl" and not self.use_wordrep_tree:
            self.dev_seq, self.test_seq = self.prepare_seqs_nl_dbg(
                self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and self.use_wordrep_tree:
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(
                self.decoding)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not self.use_wordrep_tree:
            self.dev_seq = self.prepare_seqs_en_dbg(self.decoding)

        elif self.lang == "en" and self.use_wordrep_tree:
            self.dev_seq = self.prepare_trees_en_dbg(self.decoding)
Example #4
 def get_tree(n_inst):
     trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True, eval_spec_rels=self.eval_spec_rel,
                         dirname=self.dirname, lr=self.lr)
     trees.prepare_trees()
     # not every instance has a corresponding tree due to errors in parsing
     conll_idx = ConllFilesIndex(files_parsed_path)
     conll_idx.create_ids_set()
     # extend instances with trees
     c_append = 0
     for i in range(n_inst):
         # we have a parse:
         if i + 1 in conll_idx.fileids:
             inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
             c_append += 1
         # we don't have a parse:
         else:
             inst = None
         yield inst
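
Unlike Example #2, this variant yields None for instances whose parse failed (only successfully parsed ids appear in the CoNLL file index), so callers have to filter; a minimal sketch with the same assumed `instances`:

    # Keep only the instances that actually received a parse.
    paired = [(inst, tree)
              for inst, tree in zip(instances, get_tree(len(instances)))
              if tree is not None]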
Example #5
 def get_tree(n_inst):
     trees = ConllCorpus(file_parsed,
                         howbig=1000000,
                         lemmas=False,
                         eval_spec_rels=self.eval_spec_rel,
                         dirname=self.dirname,
                         lr=self.lr)
     trees.prepare_trees()
     # extend instances with trees
     assert len(
         trees.train
     ) == n_inst, "Number of parses not equal to number of classification instances."
     c_append = 0
     for i in range(n_inst):
         # every instance has a parse here (guaranteed by the assert above)
         inst = self.normalize_tree(trees.train[c_append], trees.x_dict,
                                    c_append)
         c_append += 1
         yield inst
Example #6
 def get_tree(n_inst):
     trees = ConllCorpus(file_parsed,
                         howbig=1000000,
                         lemmas=True,
                         eval_spec_rels=self.eval_spec_rel,
                         dirname=self.dirname,
                         lr=self.lr)
     trees.prepare_trees()
     self.tree_vocab = trees.x_dict
     # not every instance has a corresponding tree due to errors in parsing
     conll_idx = ConllFilesIndex(files_parsed_path)
     conll_idx.create_ids_set()
     # extend instances with trees
     c_append = 0
     for i in range(n_inst):
         # we have a parse:
         if i + 1 in conll_idx.fileids:
             inst = trees.train[c_append]
             c_append += 1
         # we don't have a parse:
         else:
             inst = None
         yield inst
Example #7
import argparse

import numpy as np

from eval.ner.PrepareHmmRep import read_params_from_path
from hmrtm import HMRTM
from readers.conll_corpus import ConllCorpus

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-rep", "--rep_path", help="directory containing (hmm) word representations files")
    parser.add_argument("--use_lemmas", action='store_true', default=False, help="")
    args = parser.parse_args()

    path = args.rep_path
    posttype_f = "{}posttype_cumul.npy".format(path)
    n_states, n_obs, n_sent, n_toks, corpus_file, omit_class_cond, omit_emis_cond = read_params_from_path(path)
    lemmas = args.use_lemmas
    eval_spec_rel = True
    lr = False

    params_fixed = (np.load("{}ip.npy".format(path)),
                    np.load("{}tp.npy".format(path)),
                    np.load("{}fp.npy".format(path)),
                    np.load("{}ep.npy".format(path)))

    dataset = ConllCorpus("{}".format(corpus_file), howbig=n_sent, lemmas=lemmas, eval_spec_rels=eval_spec_rel,
                          dirname=path, lr=lr)
    dataset.train = dataset.prepare_trees_gen()  # generator
    h = HMRTM(n_states, n_obs, R=len(dataset.r_dict), params=params_fixed, writeout=False, dirname=path,
              omit_class_cond=omit_class_cond, omit_emis_cond=omit_emis_cond)

    h.obtain_posttypes_cumul(posttype_f, dataset, n_types=h.M, logger=None)
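
Because the script builds file names by plain concatenation ("{}ip.npy".format(path)), the -rep argument must end with a path separator. A hypothetical invocation (script name assumed):

    python obtain_posttypes.py -rep experiments/hmm_run1/ --use_lemmas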
Example #8
    # obtain model parameters
    n_states, n_obs, _, _, _, omit_class_cond, omit_emis_cond = read_params_from_path(path)
    lemmas = args.use_lemmas
    eval_spec_rel = args.synfunc
    lr = False

    # load model
    params_fixed = (np.load("{}ip.npy".format(path)),
                    np.load("{}tp.npy".format(path)),
                    np.load("{}fp.npy".format(path)),
                    np.load("{}ep.npy".format(path)))


    # prepare sents for decoding
    sents = ConllCorpus(infile, howbig=1000000, lemmas=lemmas, eval_spec_rels=eval_spec_rel, dirname=path, lr=lr)
    sents.prepare_trees()

    h = HMRTM(n_states, n_obs, R=len(sents.r_dict), params=params_fixed, writeout=False, dirname=path,
              omit_class_cond=omit_class_cond, omit_emis_cond=omit_emis_cond) if eval_spec_rel else \
        HMTM(n_states, n_obs, params=params_fixed, writeout=False, dirname=path)

    with open(args.outfile, "w") as out:
        for tree in sents.train:
            # obtain posteriors for all nodes
            node_to_rep = h.posterior_decode(tree, cont=True)
            # get words
            for node in tree.get_nonroots():
                out.write(
                    "{} {}\n".format(sents.x_dict.get_label_name(node.name), nparr_to_str(node_to_rep[node.index])))
            out.write("\n")
Example #9
    def __init__(self,
                 path,
                 lang,
                 decoding=None,
                 use_wordrep_tree=False,
                 use_wordrep_rel=False,
                 eval_spec_rel=False,
                 logger=None,
                 ignore_rel=None,
                 lr=False,
                 use_muc=False):

        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            print("Using default: {}".format(self.decoding))
        self.logger = logger
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            lemmas = self.lang != "en"
            self.dataset = ConllCorpus(self.corpus_file,
                                       howbig=self.n_sent,
                                       lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel,
                                       dirname=self.path,
                                       lr=lr)
            self.ignore_rel = self.dataset.r_dict.get_label_id(
                ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen(
                )  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file),
                                      howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()

        self.ner_corpus = None

        if self.lang == "nl" and not (self.use_wordrep_tree
                                      or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(
                self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(
                self.decoding, lr=lr)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not (self.use_wordrep_tree
                                        or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(
                self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(
                self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")

        self.dataset = None
Example #10
        n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type,
                          append_string=append_string,
                          lang=args.lang,
                          max_iter=max_iter,
                          N=start_n_states,
                          n_sent=n_sent,
                          alpha=alpha,
                          minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    if args.lang == "en":
        dataset = ConllCorpus(args.dataset,
                              howbig=n_sent,
                              lemmas=lemmas,
                              spec_rels=args.rel_spec_en,
                              dirname=dirname,
                              lr=args.lr)
    elif args.lang == "nl":
        dataset = ConllCorpus(args.dataset,
                              howbig=n_sent,
                              lemmas=lemmas,
                              spec_rels=args.rel_spec_nl,
                              dirname=dirname,
                              lr=args.lr)
    else:
        dataset = ConllCorpus(args.dataset,
                              howbig=n_sent,
                              lemmas=lemmas,
                              spec_rels=None,
                              dirname=dirname,
                              lr=args.lr)
Example #11
if args.tree or args.rel or args.lr:
    reader = Conll07Reader(args.dataset)
    sent = reader.getNext()
    while sent:
        n_sent += 1
        sent = reader.getNext()
else:
    for l in line_reader(args.dataset):
        n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type, append_string=append_string, lang=args.lang, max_iter=max_iter,
                          N=start_n_states, n_sent=n_sent, alpha=alpha, minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    if args.lang == "en":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=args.rel_spec_en,
                              dirname=dirname, lr=args.lr)
    elif args.lang == "nl":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=args.rel_spec_nl,
                              dirname=dirname, lr=args.lr)
    else:
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=None,
                              dirname=dirname, lr=args.lr)
    n_rels = len(dataset.r_dict)
else:
    dataset = TextCorpus(args.dataset, howbig=n_sent)
    dataset.prepare_chains()

n_obs = len(dataset.x_dict)

writeout = args.writeout
Example #12
    path = args.rep_path
    posttype_f = "{}posttype_cumul.npy".format(path)
    n_states, n_obs, n_sent, n_toks, corpus_file, omit_class_cond, omit_emis_cond = read_params_from_path(
        path)
    lemmas = args.use_lemmas
    eval_spec_rel = True
    lr = False

    params_fixed = (np.load("{}ip.npy".format(path)),
                    np.load("{}tp.npy".format(path)),
                    np.load("{}fp.npy".format(path)),
                    np.load("{}ep.npy".format(path)))

    dataset = ConllCorpus("{}".format(corpus_file),
                          howbig=n_sent,
                          lemmas=lemmas,
                          eval_spec_rels=eval_spec_rel,
                          dirname=path,
                          lr=lr)
    dataset.train = dataset.prepare_trees_gen()  # generator
    h = HMRTM(n_states,
              n_obs,
              R=len(dataset.r_dict),
              params=params_fixed,
              writeout=False,
              dirname=path,
              omit_class_cond=omit_class_cond,
              omit_emis_cond=omit_emis_cond)

    h.obtain_posttypes_cumul(posttype_f, dataset, n_types=h.M, logger=None)
Example #13
 def create_vocab(self, dataset, lemmas):
     d = ConllCorpus(dataset, howbig=int(1e10), lemmas=lemmas)
     return d.x_dict
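
A hedged call sketch for Example #13 (the corpus path is hypothetical):

    x_dict = self.create_vocab("data/train.conll", lemmas=False)
    print("vocabulary size:", len(x_dict))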