def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True,
             spec_rels=None, dirname=None, eval_spec_rels=False, lr=False):
    """
    Build the word dictionary and the dependency-relation dictionary
    for a corpus.

    :param corpus_file: path to the corpus file
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    :param lemmas: stored on the instance; presumably controls lemma vs. word
        form use downstream -- TODO confirm against callers
    :param spec_rels: explicit list of relations to keep; all other relations
        are collapsed onto a single "OTHER" id
    :param dirname: directory where r_dict.pickle is read from / written to
    :param eval_spec_rels: evaluation mode -- load the relation dictionary
        pickled at training time instead of rebuilding it
    :param lr: use only attachment direction ("left"/"right") instead of
        dependency relation labels
    """
    self.corpus_file = corpus_file
    self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
    # dependency labels
    self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)
    self.minfreq = minfreq
    self.howbig = howbig
    self.lemmas = lemmas
    self.lr = lr

    # read built vocab; build it on demand when the file does not exist yet
    try:
        self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
    except IOError:  # FileNotFoundError is a subclass of IOError/OSError on py3
        self.prepare_vocab_dict()
        self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
    print("LabelDictionary created.")

    if eval_spec_rels:  # in evaluation
        try:
            import pickle
            # use a context manager so the pickle file handle is closed
            with open("{}/r_dict.pickle".format(dirname), "rb") as f:
                self.r_dict = pickle.load(f)
        except IOError:
            sys.exit("r_dict does not exist.")
    else:
        if self.lr:
            # only attachment direction, no relation labels
            self.r_dict = RelationDictionary(["left", "right"])
            self.r_dict.write("{}/r_dict.pickle".format(dirname))
        else:
            try:
                with open(self.rel_file) as f:
                    r_dict = LabelDictionary([l.strip() for l in f])
            except IOError:
                self.prepare_rel_vocab_dict()
                with open(self.rel_file) as f:
                    r_dict = LabelDictionary([l.strip() for l in f])
            if spec_rels:
                # keep the specified relations and map every remaining
                # relation name onto the shared "OTHER" id
                self.r_dict = RelationDictionary(spec_rels)
                self.r_dict.add("OTHER")
                self.r_dict.add_fixed_id(
                    (set(r_dict.names) - set(spec_rels)),
                    self.r_dict.get_label_id("OTHER"))
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                self.r_dict = r_dict
    print("Relation/LabelDictionary created.")
def __init__(self, corpus_file, minfreq=0, howbig=10000):
    """
    Load (or build, then load) the word vocabulary for a corpus.

    :param corpus_file: path to the corpus file
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    """
    self.corpus_file = corpus_file
    # vocabulary file has lines of the form: w\tf\n
    self.vocab_file = "{}.vocab".format(self.corpus_file)
    self.minfreq = minfreq
    self.howbig = howbig
    # first attempt reads an existing vocab file; on IOError the vocab is
    # built once and the read is retried (a second failure propagates)
    for attempt in (1, 2):
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
            break
        except IOError:
            if attempt == 2:
                raise
            self.prepare_vocab_dict()
    print("LabelDictionary created.")
def similar_query(posttypes_file, words, vocab_file, top=20, measure="cosine"):
    """
    Query the `top` most similar words to `words` under `measure`, based on
    the posterior type representations stored in `posttypes_file`.

    :param posttypes_file: .npy file with per-word representations
    :param words: query words
    :param vocab_file: vocabulary file matching the representation rows
    :param top: number of neighbours to return per query word
    :param measure: similarity measure name (e.g. "cosine")
    """
    vocab = LabelDictionary(read_vocab(vocab_file))
    posttypes = np.load(posttypes_file)
    emissions = EmissionProb()
    return emissions.similarities_query(posttypes, words, vocab,
                                        top=top, measure=measure)
def __init__(self, corpus_file, minfreq=0, howbig=10000):
    """
    Set up the corpus reader and its word dictionary.

    :param corpus_file: path to the corpus file
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    """
    self.corpus_file = corpus_file
    self.vocab_file = "{}.vocab".format(self.corpus_file)  # file of form: w\tf\n
    self.minfreq = minfreq
    self.howbig = howbig
    # read the vocab file; if it is missing, build it exactly once and retry
    rebuilt = False
    while True:
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
            break
        except IOError:
            if rebuilt:
                raise
            self.prepare_vocab_dict()
            rebuilt = True
    print("LabelDictionary created.")
def prepare_seqs_nl(self, vocab_f):
    """
    Read the Dutch CoNLL-2002 NER train/dev/test sets and map their tokens
    to the embedding representation.

    :param vocab_f: vocabulary file for the word-representation dictionary
    :return: (train_seq, dev_seq, test_seq)
    """
    wordrep = LabelDictionary(read_vocab(vocab_f))
    self.ner_corpus = Conll2002NerCorpus(wordrep_dict=wordrep)
    # read the three splits, then attach embeddings to each
    seqs = [self.ner_corpus.read_sequence_list_conll(path)
            for path in (ned_train, ned_dev, ned_test)]
    for seq in seqs:
        mapper_corpus(seq, self.embeddings)
    train_seq, dev_seq, test_seq = seqs
    return train_seq, dev_seq, test_seq
def prepare_seqs_nl(self, vocab_f):
    """
    Build the Dutch CoNLL-2002 NER corpus reader and return the mapped
    train/dev/test sequence lists.

    :param vocab_f: vocabulary file for the word-representation dictionary
    :return: (train_seq, dev_seq, test_seq)
    """
    self.ner_corpus = Conll2002NerCorpus(
        wordrep_dict=LabelDictionary(read_vocab(vocab_f)))
    read = self.ner_corpus.read_sequence_list_conll
    train_seq = read(ned_train)
    dev_seq = read(ned_dev)
    test_seq = read(ned_test)
    # map every split onto the embedding space
    for split in (train_seq, dev_seq, test_seq):
        mapper_corpus(split, self.embeddings)
    return train_seq, dev_seq, test_seq
def get_w_reps(idx, w_reps, vocab):
    """
    Keep only the (word, representation) pairs whose vocabulary id is in idx.

    :param idx: collection of vocabulary ids to keep; falsy means "keep none"
    :param w_reps: iterable of (word, representation) pairs
    :param vocab: vocabulary file used to map words to ids
    :return: (words, representations) as two parallel lists
    """
    if not idx:
        return [], []
    w_dict = LabelDictionary(read_vocab(vocab))
    ws, reps = [], []
    for w, rep in w_reps:
        if w_dict.get_label_id(w) not in idx:
            continue
        # representations must be NaN-free
        assert not np.isnan(np.sum(rep))
        ws.append(w)
        reps.append(rep)
    return ws, reps
def prepare_seqs_en(self, vocab_f):
    """
    Read the English CoNLL-2003 NER splits (plus MUC when enabled) and map
    their tokens to the embedding representation.

    :param vocab_f: vocabulary file for the word-representation dictionary
    :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None when
        self.use_muc is false
    """
    wordrep = LabelDictionary(read_vocab(vocab_f))
    self.ner_corpus = Conll2003NerCorpus(wordrep_dict=wordrep)
    train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
    muc_seq = None
    if self.use_muc:
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test)
    for split in (train_seq, dev_seq, test_seq):
        mapper_corpus(split, self.embeddings)
    if self.use_muc:
        mapper_corpus(muc_seq, self.embeddings)
    return train_seq, dev_seq, test_seq, muc_seq
def load_embed(embed_f, vocab_f):
    """
    Read a word2vec-style text embedding file and return the numpy matrix,
    where row ids correspond to vocab ids.

    :param embed_f: embedding file; first line is "<n_words> <dim>", each
        following line is "<word> <v_1> ... <v_dim>"
    :param vocab_f: vocabulary file mapping words to row ids
    :return: numpy array of shape (n_words - 1, dim); words absent from the
        vocabulary are skipped, and one row is reserved out for </s>
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # parse the header with int() -- the original used eval(), which
        # executes arbitrary file content as code
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n
        if w not in w_dict:
            continue
        # numpy coerces the list of number strings to floats on assignment
        e_m[w_dict.get_label_id(w)] = e
    return e_m
def prepare_seqs_en(self, vocab_f):
    """
    Build the English CoNLL-2003 NER corpus reader and return the mapped
    train/dev/test (and optional MUC) sequence lists.

    :param vocab_f: vocabulary file for the word-representation dictionary
    :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None when
        self.use_muc is false
    """
    self.ner_corpus = Conll2003NerCorpus(
        wordrep_dict=LabelDictionary(read_vocab(vocab_f)))
    read = self.ner_corpus.read_sequence_list_conll
    train_seq = read(eng_train)
    dev_seq = read(eng_dev)
    test_seq = read(eng_test)
    muc_seq = read(muc_test) if self.use_muc else None
    mapper_corpus(train_seq, self.embeddings)
    mapper_corpus(dev_seq, self.embeddings)
    mapper_corpus(test_seq, self.embeddings)
    if self.use_muc:
        mapper_corpus(muc_seq, self.embeddings)
    return train_seq, dev_seq, test_seq, muc_seq
def posttype_txt(posttypes, vocab_f, threedim, vocab_r):
    """
    Produce format as in word embeddings (readers.word2vec.py).

    :param posttypes: loaded posttype.npy file
    :param vocab_f: vocabulary of the training text used for obtaining posttype.npy
    :param threedim: if true, each word representation has a per-relation
        axis and one (word+relation, column) pair is yielded per relation
    :param vocab_r: pickled relation dictionary (only read when threedim)
    :return: lazy generator of (name, vector) pairs
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    if threedim:
        import pickle
        # use a context manager so the pickle file handle is closed
        # (the original leaked it via pickle.load(open(...)))
        with open(vocab_r, "rb") as f:
            r_dict = pickle.load(f)
        rep_iter = ((w_dict.get_label_name(c), rep)
                    for c, rep in enumerate(posttypes))
        return (("{}{}".format(w, r_dict.get_label_name(r)), rep[:, r])
                for w, rep in rep_iter
                for r in range(rep.shape[1]))
    else:
        return ((w_dict.get_label_name(c), rep)
                for c, rep in enumerate(posttypes))
def get_w_indices(targets, vocab):
    """
    Return the set of vocabulary ids of the target words present in vocab.

    :param targets: iterable of words to look up
    :param vocab: vocabulary file used to map words to ids
    :return: set of vocabulary ids (empty set when targets is empty)
    """
    if not targets:
        # bug fix: the original returned {} here, which is an empty *dict*,
        # while the branch below returns a set -- keep the type consistent
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True,
             spec_rels=None, dirname=None, eval_spec_rels=False, lr=False):
    """
    Build the word dictionary and the dependency-relation dictionary
    for a corpus.

    :param corpus_file: path to the corpus file
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    :param lemmas: stored on the instance; presumably controls lemma vs. word
        form use downstream -- TODO confirm against callers
    :param spec_rels: explicit list of relations to keep; every other
        relation is collapsed onto a single "OTHER" id
    :param dirname: directory where r_dict.pickle is read from / written to
    :param eval_spec_rels: evaluation mode -- load the relation dictionary
        pickled at training time instead of rebuilding it
    :param lr: use only attachment direction ("left"/"right") instead of
        dependency relation labels
    """
    self.corpus_file = corpus_file
    self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
    # dependency labels
    self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)
    self.minfreq = minfreq
    self.howbig = howbig
    self.lemmas = lemmas
    self.lr = lr

    # read built vocab; build it on demand when the file does not exist yet
    try:
        self.x_dict = LabelDictionary(
            read_vocab(self.vocab_file, self.minfreq))
    except IOError:  # FileNotFoundError is a subclass of IOError/OSError on py3
        self.prepare_vocab_dict()
        self.x_dict = LabelDictionary(
            read_vocab(self.vocab_file, self.minfreq))
    print("LabelDictionary created.")

    if eval_spec_rels:  # in evaluation
        try:
            import pickle
            # context manager closes the pickle file handle (was leaked)
            with open("{}/r_dict.pickle".format(dirname), "rb") as f:
                self.r_dict = pickle.load(f)
        except IOError:
            sys.exit("r_dict does not exist.")
    else:
        if self.lr:
            # only attachment direction, no relation labels
            self.r_dict = RelationDictionary(["left", "right"])
            self.r_dict.write("{}/r_dict.pickle".format(dirname))
        else:
            try:
                with open(self.rel_file) as f:
                    r_dict = LabelDictionary([l.strip() for l in f])
            except IOError:
                self.prepare_rel_vocab_dict()
                with open(self.rel_file) as f:
                    r_dict = LabelDictionary([l.strip() for l in f])
            if spec_rels:
                # keep the specified relations and map all remaining
                # relation names onto the shared "OTHER" id
                self.r_dict = RelationDictionary(spec_rels)
                self.r_dict.add("OTHER")
                self.r_dict.add_fixed_id(
                    (set(r_dict.names) - set(spec_rels)),
                    self.r_dict.get_label_id("OTHER"))
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                self.r_dict = r_dict
    print("Relation/LabelDictionary created.")