示例#1
0
    def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True, spec_rels=None, dirname=None,
                 eval_spec_rels=False, lr=False):

        """
        Build the word (x_dict) and dependency-relation (r_dict) dictionaries
        for a corpus, creating the vocabulary files on disk if missing.

        :param corpus_file: path to the corpus file
        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        :param lemmas: whether lemmas are used (stored; consumed elsewhere)
        :param spec_rels: relation names to keep as distinct labels; all other
            relations are mapped to a shared "OTHER" id
        :param dirname: directory used to read/write the pickled r_dict
        :param eval_spec_rels: if True, load a previously written r_dict (evaluation)
        :param lr: if True, use only "left"/"right" as relation labels
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        # read built vocab; build it first when it does not exist yet
        # (IOError is the py2-compatible ancestor of FileNotFoundError)
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation: reuse the dictionary written at training time
            try:
                import pickle

                # close the file handle deterministically (was left open before)
                with open("{}/r_dict.pickle".format(dirname), "rb") as pickle_f:
                    self.r_dict = pickle.load(pickle_f)
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                # close the relation-vocabulary file deterministically
                try:
                    with open(self.rel_file) as rel_f:
                        r_dict = LabelDictionary([l.strip() for l in rel_f])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    with open(self.rel_file) as rel_f:
                        r_dict = LabelDictionary([l.strip() for l in rel_f])
                if spec_rels:
                    # keep only the specified relations; every other relation
                    # shares the single "OTHER" id
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id((set(r_dict.names) - set(spec_rels)), self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")
示例#2
0
    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        Build the word dictionary (x_dict) for a corpus.

        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "%s.vocab" % self.corpus_file  # file of form: w\tf\n

        self.minfreq = minfreq
        self.howbig = howbig
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            # no prebuilt vocabulary on disk -- build it, then retry
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))

        print("LabelDictionary created.")
示例#3
0
def similar_query(posttypes_file, words, vocab_file, top=20, measure="cosine"):
    """Run a similarity query over loaded posttype vectors (delegates to
    EmissionProb.similarities_query)."""
    posttypes = np.load(posttypes_file)
    vocab = LabelDictionary(read_vocab(vocab_file))
    emission = EmissionProb()
    return emission.similarities_query(posttypes, words, vocab,
                                       top=top, measure=measure)
示例#4
0
    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        Prepare the word dictionary for a corpus, building the vocabulary
        file if it is not on disk yet.

        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        # vocabulary file of form: w\tf\n
        self.vocab_file = "{0}.vocab".format(self.corpus_file)

        self.minfreq = minfreq
        self.howbig = howbig
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            # vocabulary missing -- create it and load again
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))

        print("LabelDictionary created.")
示例#5
0
    def prepare_seqs_nl(self, vocab_f):
        """Read the ned_* CoNLL-2002 train/dev/test sequence lists and map
        each onto the embeddings."""
        self.ner_corpus = Conll2002NerCorpus(wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

        for seq in (train_seq, dev_seq, test_seq):
            mapper_corpus(seq, self.embeddings)

        return train_seq, dev_seq, test_seq
示例#6
0
    def prepare_seqs_nl(self, vocab_f):
        """Load the ned_* CoNLL-2002 sequence lists; map all three splits
        onto the embeddings before returning them."""
        wordrep_dict = LabelDictionary(read_vocab(vocab_f))
        self.ner_corpus = Conll2002NerCorpus(wordrep_dict=wordrep_dict)

        read = self.ner_corpus.read_sequence_list_conll
        train_seq = read(ned_train)
        dev_seq = read(ned_dev)
        test_seq = read(ned_test)

        mapper_corpus(train_seq, self.embeddings)
        mapper_corpus(dev_seq, self.embeddings)
        mapper_corpus(test_seq, self.embeddings)

        return train_seq, dev_seq, test_seq
示例#7
0
def get_w_reps(idx, w_reps, vocab):
    """
    Keep only the (word, representation) pairs whose vocabulary id is in *idx*.

    :param idx: collection of vocabulary ids to keep; falsy keeps nothing
    :param w_reps: iterable of (word, representation) pairs
    :param vocab: path to the vocabulary file
    :return: (words, representations) as two parallel lists
    """
    ws = []
    reps = []
    if not idx:
        return ws, reps

    w_dict = LabelDictionary(read_vocab(vocab))
    for word, rep in w_reps:
        if w_dict.get_label_id(word) not in idx:
            continue
        # representations must be NaN-free
        assert not np.isnan(np.sum(rep))
        ws.append(word)
        reps.append(rep)

    return ws, reps
示例#8
0
def get_w_reps(idx, w_reps, vocab):
    """
    Collect the words (and their representations) whose vocabulary id
    appears in *idx*; a falsy *idx* yields two empty lists.
    """
    words, vectors = [], []
    if not idx:
        return words, vectors

    w_dict = LabelDictionary(read_vocab(vocab))
    for word, vec in w_reps:
        if w_dict.get_label_id(word) in idx:
            assert not np.isnan(np.sum(vec))  # reject NaN representations
            words.append(word)
            vectors.append(vec)

    return words, vectors
示例#9
0
    def prepare_seqs_en(self, vocab_f):
        """Read the eng_* CoNLL-2003 splits (plus MUC when enabled) and map
        each onto the embeddings."""
        self.ner_corpus = Conll2003NerCorpus(wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        if self.use_muc:
            muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test)
        else:
            muc_seq = None

        for seq in (train_seq, dev_seq, test_seq):
            mapper_corpus(seq, self.embeddings)
        if self.use_muc:
            mapper_corpus(muc_seq, self.embeddings)

        return train_seq, dev_seq, test_seq, muc_seq
示例#10
0
def load_embed(embed_f, vocab_f):
    """
    Read a word2vec-style text embedding file and return the numpy matrix,
    where row ids correspond to vocab ids.

    :param embed_f: embedding file; first line is "<n_words> <dim>", every
        following line is "<word> <dim floats>"
    :param vocab_f: vocabulary file mapping words to ids (via read_vocab)
    :return: (m-1) x n matrix; words absent from the vocabulary are skipped
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # header: word count and dimensionality; int() instead of eval() --
        # never execute content read from a data file
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n  # malformed embedding line

        if w not in w_dict:
            continue  # word not in our vocabulary -- drop its embedding
        e_m[w_dict.get_label_id(w)] = e
    return e_m
示例#11
0
def load_embed(embed_f, vocab_f):
    """
    Read the embedding file and return the numpy matrix, where row ids
    correspond to vocab ids.

    :param embed_f: word2vec-style text file ("<n_words> <dim>" header,
        then one "<word> <floats>" line per word)
    :param vocab_f: vocabulary file mapping words to ids (via read_vocab)
    :return: (m-1) x n matrix; out-of-vocabulary words are skipped
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # parse the two header integers with int(), not eval(): eval would
        # execute arbitrary expressions found in the data file
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n  # every line must carry exactly n values

        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
示例#12
0
    def prepare_seqs_en(self, vocab_f):
        """Load the English CoNLL-2003 train/dev/test sequences (and the MUC
        set when self.use_muc), mapping every split onto the embeddings."""
        self.ner_corpus = Conll2003NerCorpus(
            wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        read = self.ner_corpus.read_sequence_list_conll
        train_seq = read(eng_train)
        dev_seq = read(eng_dev)
        test_seq = read(eng_test)
        muc_seq = read(muc_test) if self.use_muc else None

        mapper_corpus(train_seq, self.embeddings)
        mapper_corpus(dev_seq, self.embeddings)
        mapper_corpus(test_seq, self.embeddings)
        if self.use_muc:
            mapper_corpus(muc_seq, self.embeddings)

        return train_seq, dev_seq, test_seq, muc_seq
示例#13
0
def posttype_txt(posttypes, vocab_f, threedim, vocab_r):
    """
    Produce format as in word embeddings (readers.word2vec.py).

    :param posttypes: loaded posttype.npy file
    :param vocab_f: vocabulary of the training text used for obtaining posttype.npy
    :param threedim: if true, expand the relation axis into separate entries
    :param vocab_r: pickled relation dictionary (used only when threedim)
    :return: generator of (name, vector) pairs
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))

    if threedim:
        import pickle

        # NOTE: pickle.load can execute arbitrary code -- only load trusted
        # files.  `with` closes the handle deterministically (was left open).
        with open(vocab_r, "rb") as in_f:
            r_dict = pickle.load(in_f)

        rep_iter = ((w_dict.get_label_name(c), rep) for c, rep in enumerate(posttypes))
        # one entry per (word, relation): name "<word><rel>", vector = column r
        return (("{}{}".format(w, r_dict.get_label_name(r)), rep[:, r])
                for w, rep in rep_iter
                for r in range(rep.shape[1]))
    else:
        return ((w_dict.get_label_name(c), rep) for c, rep in enumerate(posttypes))
示例#14
0
def posttype_txt(posttypes, vocab_f, threedim, vocab_r):
    """
    Produce format as in word embeddings (readers.word2vec.py).

    :param posttypes: loaded posttype.npy file
    :param vocab_f: vocabulary of the training text used for obtaining posttype.npy
    :param threedim: if true, emit one entry per (word, relation) pair
    :param vocab_r: pickled relation dictionary (only read when threedim)
    :return: generator of (name, vector) pairs
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))

    if threedim:
        import pickle

        # NOTE: pickle.load can execute arbitrary code -- load trusted files
        # only.  Use `with` so the handle is closed (it previously leaked).
        with open(vocab_r, "rb") as in_f:
            r_dict = pickle.load(in_f)

        rep_iter = ((w_dict.get_label_name(c), rep)
                    for c, rep in enumerate(posttypes))
        return (("{}{}".format(w, r_dict.get_label_name(r)), rep[:, r])
                for w, rep in rep_iter for r in range(rep.shape[1]))
    else:
        return ((w_dict.get_label_name(c), rep)
                for c, rep in enumerate(posttypes))
示例#15
0
def get_w_indices(targets, vocab):
    """
    Map target words to their vocabulary ids.

    :param targets: iterable of words to look up; falsy means no targets
    :param vocab: path to the vocabulary file
    :return: set of vocabulary ids of the targets found in the vocabulary
    """
    if not targets:
        # bug fix: `return {}` returned an empty *dict*, while the main path
        # returns a *set* -- keep the return type consistent
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
示例#16
0
def get_w_indices(targets, vocab):
    """
    Return the set of vocabulary ids for the given target words.

    :param targets: iterable of words; falsy yields an empty set
    :param vocab: path to the vocabulary file
    :return: set of ids for targets present in the vocabulary
    """
    if not targets:
        # bug fix: was `return {}` (an empty dict) -- the non-empty branch
        # returns a set, so return an empty set for type consistency
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
示例#17
0
def similar_query(posttypes_file, words, vocab_file, top=20, measure="cosine"):
    """Query word similarities over posttype vectors via
    EmissionProb.similarities_query."""
    vocab = LabelDictionary(read_vocab(vocab_file))
    posttypes = np.load(posttypes_file)
    return EmissionProb().similarities_query(
        posttypes, words, vocab, top=top, measure=measure)
示例#18
0
    def __init__(self,
                 corpus_file,
                 minfreq=0,
                 howbig=1000,
                 lemmas=True,
                 spec_rels=None,
                 dirname=None,
                 eval_spec_rels=False,
                 lr=False):
        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file,
                                                 howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        #read built vocab
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                self.r_dict = pickle.load(
                    open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id(
                        (set(r_dict.names) - set(spec_rels)),
                        self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")