def __init__(self, corpus_file, minfreq=0, howbig=10000):
    """
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    """
    self.corpus_file = corpus_file
    # vocabulary file of form: w\tf\n
    self.vocab_file = "{}.vocab".format(self.corpus_file)
    self.minfreq = minfreq
    self.howbig = howbig

    def load_dict():
        # build the dictionary from the on-disk vocab file
        return LabelDictionary(read_vocab(self.vocab_file, self.minfreq))

    try:
        self.x_dict = load_dict()
    except IOError:
        # vocab file not there yet: create it, then load again
        self.prepare_vocab_dict()
        self.x_dict = load_dict()
    print("LabelDictionary created.")
def __init__(self):
    """Build a toy observation vocabulary and three training sequences."""
    # observation vocabulary
    self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])
    # training sequences
    train_seqs = SequenceList(self.x_dict)
    for observations in (["walk", "walk", "shop", "clean"],
                         ["walk", "walk", "shop", "clean"],
                         ["walk", "shop", "shop", "clean"]):
        train_seqs.add_sequence(observations)
    self.train = train_seqs
def prepare_seqs_nl(self, vocab_f):
    """Read the Dutch CoNLL-2002 splits and map them through the embeddings.

    Returns the (train, dev, test) sequence lists.
    """
    self.ner_corpus = Conll2002NerCorpus(
        wordrep_dict=LabelDictionary(read_vocab(vocab_f)))
    # read all three splits with the same reader
    seqs = [self.ner_corpus.read_sequence_list_conll(path)
            for path in (ned_train, ned_dev, ned_test)]
    for seq in seqs:
        mapper_corpus(seq, self.embeddings)
    train_seq, dev_seq, test_seq = seqs
    return train_seq, dev_seq, test_seq
def get_w_reps(idx, w_reps, vocab):
    """Filter (word, representation) pairs down to words whose vocab id is in idx.

    :param idx: collection of vocabulary ids to keep (falsy -> nothing kept)
    :param w_reps: iterable of (word, representation) pairs
    :param vocab: path to the vocabulary file
    :return: (words, representations) as two parallel lists
    """
    if not idx:
        # nothing requested: return two empty lists
        return [], []
    w_dict = LabelDictionary(read_vocab(vocab))
    ws, reps = [], []
    for word, rep in w_reps:
        if w_dict.get_label_id(word) not in idx:
            continue
        # representations must not contain NaNs
        assert not np.isnan(np.sum(rep))
        ws.append(word)
        reps.append(rep)
    return ws, reps
def __init__(self, wordrep_dict=None, eval_spec_rel=False, dirname=None,
             lr=False, use_wordrep_tree=False):
    """
    :param wordrep_dict: x_dictionary from training of word representations
    :param use_wordrep_tree: use parse tree representations
    """
    self.wordrep_dict = wordrep_dict
    # seed the word dictionary from the wordrep dictionary when provided,
    # otherwise start empty
    self.word_dict = (LabelDictionary() if self.wordrep_dict is None
                      else self.wordrep_dict.copy())
    self.tag_dict = LabelDictionary()  # ner tag
    self.use_wordrep_tree = use_wordrep_tree
    # built later: SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
    self.sequence_list = None
    self.eval_spec_rel = eval_spec_rel
    self.dirname = dirname
    self.lr = lr
    # for conll2002 lemma format preparation:
    self.tree_vocab = None
def __init__(self, dataset):
    '''dataset is a sequence list.'''
    self.feature_dict = LabelDictionary()
    self.feature_list = []
    self.add_features = False
    self.dataset = dataset
    # caches that speed up repeated feature lookups
    self.node_feature_cache = {}
    self.initial_state_feature_cache = {}
    self.final_state_feature_cache = {}
    self.edge_feature_cache = {}
    # ids of features actually seen during extraction
    self.features_used = set()
def load_embed(embed_f, vocab_f):
    """
    Reads the embedding file and returns the numpy matrix, where row ids
    corresponds to vocab ids.

    :param embed_f: word2vec-style text file: header "<n_words> <dim>",
        then one "word v1 ... v_dim" line per word
    :param vocab_f: vocabulary file used to map words to row ids
    :return: numpy array of shape (n_words - 1, dim); </s> is left out
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # parse the header with int(), not eval(): the header comes from an
        # external file and eval() would execute arbitrary code
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n
        # skip words that are not in the vocabulary
        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
def prepare_seqs_en(self, vocab_f):
    """Read the English CoNLL-2003 splits (plus MUC when enabled) and map
    them through the embeddings.

    Returns (train, dev, test, muc) sequence lists; muc is None when
    self.use_muc is false.
    """
    self.ner_corpus = Conll2003NerCorpus(
        wordrep_dict=LabelDictionary(read_vocab(vocab_f)))
    read = self.ner_corpus.read_sequence_list_conll
    train_seq = read(eng_train)
    dev_seq = read(eng_dev)
    test_seq = read(eng_test)
    muc_seq = read(muc_test) if self.use_muc else None
    for seq in (train_seq, dev_seq, test_seq):
        mapper_corpus(seq, self.embeddings)
    if self.use_muc:
        mapper_corpus(muc_seq, self.embeddings)
    return train_seq, dev_seq, test_seq, muc_seq
def __init__(self):
    """Build a toy vocabulary and two training dependency trees."""
    self.x_dict = LabelDictionary(
        ["write", "that", "code", "ROOT", "don't"])
    self.train_trees = TreeList()

    def add_word_node(tree, word):
        # node id is the current size of the tree it is added to
        node = Node(len(tree), self.x_dict.get_label_id(word))
        tree.add_node(node)
        return node

    # tree 1: ROOT -> write -> code -> that
    tree_ex1 = Tree()  # container for node_list and edge_list
    n0 = add_word_node(tree_ex1, "write")
    n1 = add_word_node(tree_ex1, "that")
    n2 = add_word_node(tree_ex1, "code")
    n3 = add_word_node(tree_ex1, "ROOT")
    tree_ex1.add_edge(Edge(n0, n2))
    tree_ex1.add_edge(Edge(n2, n1))
    tree_ex1.add_edge(Edge(n3, n0))
    self.train_trees.add_tree(tree_ex1)

    # tree 2: ROOT -> don't -> write -> code
    # BUG FIX: node ids were taken from len(tree_ex1) (already 4 here), so
    # every node of the second tree got the same id 4; ids must come from
    # the size of the tree being built (0, 1, 2, 3).
    tree_ex2 = Tree()
    n0 = add_word_node(tree_ex2, "don't")
    n1 = add_word_node(tree_ex2, "write")
    n2 = add_word_node(tree_ex2, "code")
    n3 = add_word_node(tree_ex2, "ROOT")
    tree_ex2.add_edge(Edge(n0, n1))
    tree_ex2.add_edge(Edge(n1, n2))
    tree_ex2.add_edge(Edge(n3, n0))
    self.train_trees.add_tree(tree_ex2)
def __init__(self):
    """Create an empty word dictionary and a sequence list bound to it."""
    word_dict = LabelDictionary()
    self.word_dict = word_dict
    self.sequence_list = SequenceList(word_dict)
def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True,
             spec_rels=None, dirname=None, eval_spec_rels=False, lr=False):
    """
    :param howbig: number of sentences to take into account
    """
    # Builds the word dictionary (x_dict) and the dependency-relation
    # dictionary (r_dict) for the corpus, creating the on-disk vocab
    # files on first use.
    self.corpus_file = corpus_file
    # vocab filenames are suffixed with the sentence count so different
    # corpus sizes get separate vocab files
    self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
    self.rel_file = "{}.rels.vocab{}".format(self.corpus_file,
                                             howbig)  # dependency labels
    self.minfreq = minfreq
    self.howbig = howbig
    self.lemmas = lemmas
    # lr: collapse all relations to just "left"/"right" direction labels
    self.lr = lr
    #read built vocab
    try:
        self.x_dict = LabelDictionary(
            read_vocab(self.vocab_file, self.minfreq))
    #except FileNotFoundError:
    except IOError:
        # vocab file missing: build it, then load again
        self.prepare_vocab_dict()
        self.x_dict = LabelDictionary(
            read_vocab(self.vocab_file, self.minfreq))
    print("LabelDictionary created.")

    if eval_spec_rels:  # in evaluation
        # evaluation must reuse the exact relation dictionary that was
        # pickled at training time
        try:
            import pickle
            self.r_dict = pickle.load(
                open("{}/r_dict.pickle".format(dirname), "rb"))
        except IOError:
            sys.exit("r_dict does not exist.")
    else:
        if self.lr:
            # direction-only relation labels; persist for later evaluation
            self.r_dict = RelationDictionary(["left", "right"])
            self.r_dict.write("{}/r_dict.pickle".format(dirname))
        else:
            try:
                r_dict = LabelDictionary(
                    [l.strip() for l in open(self.rel_file)])
            except IOError:
                # relation vocab file missing: build it, then load again
                self.prepare_rel_vocab_dict()
                r_dict = LabelDictionary(
                    [l.strip() for l in open(self.rel_file)])
            if spec_rels:
                # keep only the specified relations; map every remaining
                # relation name to the single shared id of "OTHER"
                self.r_dict = RelationDictionary(spec_rels)
                self.r_dict.add("OTHER")
                self.r_dict.add_fixed_id(
                    (set(r_dict.names) - set(spec_rels)),
                    self.r_dict.get_label_id("OTHER"))
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                self.r_dict = r_dict
    print("Relation/LabelDictionary created.")
def get_w_indices(targets, vocab):
    """Return the set of vocabulary ids for the target words found in vocab.

    :param targets: iterable of words to look up (falsy -> empty set)
    :param vocab: path to the vocabulary file
    :return: set of vocabulary ids
    """
    if not targets:
        # BUG FIX: previously returned {} (an empty *dict*); return an
        # empty set so the return type matches the non-empty case
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
def __init__(self, wordrep_dict=None):
    """Initialise empty word/tag dictionaries.

    :param wordrep_dict: accepted for interface compatibility; not used here
    """
    self.word_dict = LabelDictionary()
    self.tag_dict = LabelDictionary()
    # filled in later when sequences are read
    self.sequence_list = None