def get_w_reps(idx, w_reps, vocab):
    ws = []
    reps = []
    if not idx:
        return ws, reps

    w_dict = LabelDictionary(read_vocab(vocab))
    for w, rep in w_reps:
        if w_dict.get_label_id(w) in idx:
            assert not np.isnan(np.sum(rep))
            ws.append(w)
            reps.append(rep)

    return ws, reps
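# Hedged usage sketch for get_w_reps: "corpus.vocab" is a placeholder path and
# the (word, vector) pairs are illustrative; LabelDictionary/read_vocab/np are
# this module's own imports. get_w_indices is defined further below.
def _demo_get_w_reps():
    w_reps = [("walk", np.zeros(5)), ("shop", np.ones(5))]
    # keep only the pairs whose vocabulary id is in the given id set
    idx = get_w_indices({"walk", "shop"}, "corpus.vocab")
    ws, reps = get_w_reps(idx, w_reps, "corpus.vocab")
    print(ws)  # the subset of words found in the vocab file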
def load_embed(embed_f, vocab_f):
    """
    Read the embedding file and return a numpy matrix whose row ids correspond to vocab ids.
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))

    with open(embed_f) as in_f:
        m, n = map(int, in_f.readline().strip().split())  # header: n_rows n_dims
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>

    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n
        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e  # numpy casts the numeric string fields to float

    return e_m
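# Hedged sketch of the embedding-file format load_embed expects (word2vec text
# format: a "rows dims" header line, then one word per line). The file names
# below are placeholders written on the fly for illustration.
def _demo_load_embed():
    with open("toy.embed", "w") as f:
        f.write("3 2\n")           # 3 rows (including </s>), 2 dimensions
        f.write("</s> 0.0 0.0\n")  # not stored: the matrix is sized m - 1
        f.write("walk 0.1 0.2\n")
        f.write("shop 0.3 0.4\n")
    e_m = load_embed("toy.embed", "toy.vocab")  # toy.vocab: "word\tfreq" lines
    print(e_m.shape)  # (2, 2); each row sits at its word's vocab id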
def __init__(self): self.x_dict = LabelDictionary( ["write", "that", "code", "ROOT", "don't"]) self.train_trees = TreeList() tree_ex1 = Tree() # container for node_list and edge_list idx = self.x_dict.get_label_id("write") n0 = Node(len(tree_ex1), idx) # len is 0 tree_ex1.add_node(n0) idx = self.x_dict.get_label_id("that") n1 = Node(len(tree_ex1), idx) tree_ex1.add_node(n1) idx = self.x_dict.get_label_id("code") n2 = Node(len(tree_ex1), idx) tree_ex1.add_node(n2) idx = self.x_dict.get_label_id("ROOT") n3 = Node(len(tree_ex1), idx) tree_ex1.add_node(n3) tree_ex1.add_edge(Edge(n0, n2)) tree_ex1.add_edge(Edge(n2, n1)) tree_ex1.add_edge(Edge(n3, n0)) self.train_trees.add_tree(tree_ex1) tree_ex2 = Tree() idx = self.x_dict.get_label_id("don't") n0 = Node(len(tree_ex1), idx) # len is 0 tree_ex2.add_node(n0) idx = self.x_dict.get_label_id("write") n1 = Node(len(tree_ex1), idx) tree_ex2.add_node(n1) idx = self.x_dict.get_label_id("code") n2 = Node(len(tree_ex1), idx) tree_ex2.add_node(n2) idx = self.x_dict.get_label_id("ROOT") n3 = Node(len(tree_ex1), idx) tree_ex2.add_node(n3) tree_ex2.add_edge(Edge(n0, n1)) tree_ex2.add_edge(Edge(n1, n2)) tree_ex2.add_edge(Edge(n3, n0)) self.train_trees.add_tree(tree_ex2)
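# The two toy dependency trees built above, with edges running head -> dependent:
#
#   tree_ex1: ROOT -> write -> code -> that    ("write that code")
#   tree_ex2: ROOT -> don't -> write -> code   ("don't write code")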
def __init__(self):
    # observation vocabulary
    self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])
    # training sequences
    train_seqs = SequenceList(self.x_dict)
    train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
    train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
    train_seqs.add_sequence(["walk", "shop", "shop", "clean"])
    self.train = train_seqs
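# Assuming LabelDictionary assigns ids in insertion order (as its use elsewhere
# in this module suggests), the toy vocabulary above maps walk->0, shop->1,
# clean->2, tennis->3, so ["walk", "walk", "shop", "clean"] is stored as
# the id sequence [0, 0, 1, 2].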
def __init__(self, corpus_file, minfreq=0, howbig=10000):
    """
    :param minfreq: minimum frequency of a word in order to be taken into account
    :param howbig: number of sentences to take into account
    """
    self.corpus_file = corpus_file
    self.vocab_file = "{}.vocab".format(self.corpus_file)  # file of form: w\tf\n
    self.minfreq = minfreq
    self.howbig = howbig
    try:
        self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
    except IOError:
        self.prepare_vocab_dict()
        self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
    print("LabelDictionary created.")
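# Hedged sketch: if the "<corpus>.vocab" file is missing, the constructor above
# first builds it (prepare_vocab_dict) and then loads it. `TextCorpus` is a
# hypothetical name for the class this __init__ belongs to; the path is a placeholder.
def _demo_vocab_corpus():
    corpus = TextCorpus("corpus.txt", minfreq=2, howbig=5000)
    print(len(corpus.x_dict))  # vocabulary size after minfreq filtering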
def prepare_seqs_nl(self, vocab_f):
    self.ner_corpus = Conll2002NerCorpus(wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

    train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

    mapper_corpus(train_seq, self.embeddings)
    mapper_corpus(dev_seq, self.embeddings)
    mapper_corpus(test_seq, self.embeddings)

    return train_seq, dev_seq, test_seq
def prepare_seqs_en(self, vocab_f):
    self.ner_corpus = Conll2003NerCorpus(wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

    train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
    dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
    test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
    muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None

    mapper_corpus(train_seq, self.embeddings)
    mapper_corpus(dev_seq, self.embeddings)
    mapper_corpus(test_seq, self.embeddings)
    if self.use_muc:
        mapper_corpus(muc_seq, self.embeddings)

    return train_seq, dev_seq, test_seq, muc_seq
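# Hedged usage sketch for the two prepare_seqs_* methods above: `experiment`
# stands in for whatever object defines them (it must already carry
# .embeddings and, for the English variant, .use_muc); vocab paths are
# placeholders, and ned_*/eng_*/muc_test are module-level corpus paths.
#
#   train_seq, dev_seq, test_seq = experiment.prepare_seqs_nl("nl.vocab")
#   train_seq, dev_seq, test_seq, muc_seq = experiment.prepare_seqs_en("en.vocab")
#
# mapper_corpus presumably attaches the embedding representation to each
# sequence in place, so the returned lists are ready for an embedding-based tagger.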
class IDFeatures:
    '''
    Base class to extract features from a particular dataset.

    feature_dict --> dictionary of all existing features; maps feature_name (string) --> feature_id (int)
    feature_names --> list of feature names; each position is the feature_id and contains the feature name
    nr_feats --> total number of features
    feature_list --> for each sentence in the corpus, contains a pair of node features and edge features
    dataset --> the original dataset for which the features were extracted

    Caches (for speedup):
    initial_state_feature_cache -->
    node_feature_cache -->
    edge_feature_cache -->
    final_state_feature_cache -->
    '''

    def __init__(self, dataset):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.feature_list = []
        self.add_features = False
        self.dataset = dataset

        # Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.features_used = set()

    def get_num_features(self):
        return len(self.feature_dict)

    def build_features(self):
        '''
        Generic function to build features for a given dataset.
        Iterates through all sentences in the dataset, extracts their features,
        and saves the node/edge features in feature_list.
        '''
        self.add_features = True
        for sequence in self.dataset.seq_list:
            initial_features, transition_features, final_features, emission_features = \
                self.get_sequence_features(sequence)
            self.feature_list.append([initial_features, transition_features, final_features, emission_features])
        self.add_features = False

    def get_sequence_features(self, sequence):
        '''
        Returns the features for a given sequence.

        For a sequence of size N returns:
        Node_features: a list of size N. Each entry contains the node potentials for that position.
        Edge_features: a list of size N+1.
        - Entry 0 contains the initial features
        - Entry N contains the final features
        - Entry i contains entries mapping the transition from i-1 to i.
        '''
        emission_features = []
        initial_features = []
        transition_features = []
        final_features = []

        # Take care of first position
        features = []
        features = self.add_initial_features(sequence, sequence.y[0], features)
        initial_features.append(features)

        # Take care of middle positions
        for pos, tag in enumerate(sequence.y):
            features = []
            features = self.add_emission_features(sequence, pos, sequence.y[pos], features)
            emission_features.append(features)

            if pos > 0:
                prev_tag = sequence.y[pos - 1]
                features = []
                features = self.add_transition_features(sequence, pos - 1, tag, prev_tag, features)
                transition_features.append(features)
            """
            if pos > 1:
                prev_tag = sequence.y[pos-1]
                prev_prev_tag = sequence.y[pos-2]
                features = []
                features = self.add_transition_features(sequence, pos-1, tag, prev_tag, prev_prev_tag, features)
                transition_features.append(features)
            """

        # Take care of final position
        features = []
        features = self.add_final_features(sequence, sequence.y[-1], features)
        final_features.append(features)

        return initial_features, transition_features, final_features, emission_features

    # f(t,y_t,X)
    # Adds the word identity; if the position is the first,
    # also adds the tag position.
    def get_emission_features(self, sequence, pos, y):
        x = sequence.x[pos]
        if x not in self.node_feature_cache:
            self.node_feature_cache[x] = {}
        if y not in self.node_feature_cache[x]:
            node_idx = []
            node_idx = self.add_emission_features(sequence, pos, y, node_idx)
            self.node_feature_cache[x][y] = node_idx
        idx = self.node_feature_cache[x][y]
        all_feat = idx[:]
        return all_feat

    # f(t,y_t,y_(t-1),X)
    # Speed up of code
    def get_transition_features(self, sequence, pos, y, y_prev):
        assert 0 <= pos < len(sequence.x)
        if y not in self.edge_feature_cache:
            self.edge_feature_cache[y] = {}
        if y_prev not in self.edge_feature_cache[y]:
            edge_idx = []
            edge_idx = self.add_transition_features(sequence, pos, y, y_prev, edge_idx)
            self.edge_feature_cache[y][y_prev] = edge_idx
        return self.edge_feature_cache[y][y_prev]

    def get_initial_features(self, sequence, y):
        if y not in self.initial_state_feature_cache:
            edge_idx = []
            edge_idx = self.add_initial_features(sequence, y, edge_idx)
            self.initial_state_feature_cache[y] = edge_idx
        return self.initial_state_feature_cache[y]

    def get_final_features(self, sequence, y_prev):
        if y_prev not in self.final_state_feature_cache:
            edge_idx = []
            edge_idx = self.add_final_features(sequence, y_prev, edge_idx)
            self.final_state_feature_cache[y_prev] = edge_idx
        return self.final_state_feature_cache[y_prev]

    def add_initial_features(self, sequence, y, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Generate feature name.
        feat_name = "init_tag:{}".format(y_name)
        self.features_used.add("init_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_final_features(self, sequence, y_prev, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "final_prev_tag:{}".format(y_name)
        self.features_used.add("final_prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_emission_features(self, sequence, pos, y, features):
        '''Add word-tag pair feature.'''
        x = sequence.x[pos]
        # Get tag name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get word name from ID.
        x_name = self.dataset.x_dict.get_label_name(x)
        # Generate feature name.
        feat_name = "id:{}::{}".format(x_name, y_name)
        self.features_used.add("id")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_transition_features(self, sequence, pos, y, y_prev, features):
        """
        Adds a feature to the edge feature list.
        Creates a unique id if it's the first time the feature is visited,
        or returns the existing id otherwise.
        """
        assert pos < len(sequence.x) - 1
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get previous label name from ID.
        y_prev_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "prev_tag:{}::{}".format(y_prev_name, y_name)
        self.features_used.add("prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_feature(self, feat_name):
        """
        Builds a dictionary of feature name to feature id.
        If we are at test time and we don't have the feature,
        we return -1.
        """
        # Check if feature exists and if so, return the feature ID.
        if feat_name in self.feature_dict:
            return self.feature_dict[feat_name]
        # If 'add_features' is True, add the feature to the feature
        # dictionary and return the feature ID. Otherwise return -1.
        if not self.add_features:
            return -1
        return self.feature_dict.add(feat_name)
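# Hedged usage sketch for IDFeatures: assumes a supervised sequence list whose
# sequences carry .x/.y id arrays and whose container exposes x_dict/y_dict
# (the SequenceList/LabelDictionary machinery used throughout this module).
def _demo_id_features(train_seqs):
    feats = IDFeatures(train_seqs)
    feats.build_features()           # fills feature_dict and feature_list
    print(feats.get_num_features())  # total number of distinct feature ids
    # At test time add_features stays False, so unseen feature names map to -1
    # inside add_feature and are simply not appended.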
def get_w_indices(targets, vocab):
    if not targets:
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
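# Example (placeholder path): get_w_indices({"walk", "unseen"}, "corpus.vocab")
# returns the set of vocab ids for the targets present in the vocab file, here
# just walk's id; words missing from the vocab are silently dropped.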
class Conll2002NerCorpus:
    """
    Optionally reads text to which we want to apply a wordrep such as hmm.
    - no update of the wordrep_dict; every word not in it (from x_dict) gets the
      *unk* id, which is needed for successful decoding
    """

    def __init__(self, wordrep_dict=None, eval_spec_rel=False, dirname=None, lr=False, use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """
        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None

    def read_sequence_list_conll(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                 max_sent_len=100000, max_nr_sent=100000):
        """
        Read a conll2002 or conll2003 file into a sequence list.
        Optionally add a sequence list/tree with *unk* for decoding in wordrep.
        """
        instance_list = self.read_conll_instances(train_file, train_file_parsed, train_files_parsed_path,
                                                  max_sent_len, max_nr_sent)
        if self.wordrep_dict is not None:
            seq_list = SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)  # for indices
            for sent_x, sent_y, sent_ in instance_list:
                # sent_ is a normalized tree
                if self.use_wordrep_tree:
                    seq_list.add_sequence(sent_x, sent_y, None, sent_)
                # sent_ is a normalized chain
                else:
                    seq_list.add_sequence(sent_x, sent_y, sent_)
        else:
            seq_list = SequenceListLabel(self.word_dict, self.tag_dict)  # for indices
            for sent_x, sent_y in instance_list:
                seq_list.add_sequence(sent_x, sent_y)

        return seq_list

    def read_conll_instances(self, file, file_parsed, files_parsed_path, max_sent_len, max_nr_sent):
        """
        TODO: refactor the entire method, lots of overlap chain/tree/token/lemma
        """
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True,
                                eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        if self.use_wordrep_tree:
            if file_parsed is None or files_parsed_path is None:
                sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")

        nr_sent = 0
        instances = []
        ex_x = []
        ex_y = []
        include_ex_z = (self.wordrep_dict is not None and not self.use_wordrep_tree)
        if include_ex_z:
            ex_z = []

        for line in contents:
            if line.startswith("-DOCSTART"):
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instances.append([ex_x, ex_y, ex_z] if include_ex_z else [ex_x, ex_y])
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_y = []
                if include_ex_z:
                    ex_z = []  # reset the normalized chain alongside ex_x/ex_y
            else:
                tag = toks[2]
                word = toks[0]
                if word not in self.word_dict:
                    self.word_dict.add(word)
                if tag not in self.tag_dict:
                    self.tag_dict.add(tag)
                ex_x.append(word)
                ex_y.append(tag)
                if include_ex_z:
                    ex_z.append(self.normalize_word(word))

        # add parsed data to use tree wordreps
        if self.use_wordrep_tree:
            for c, instance in enumerate(get_tree(len(instances))):  # get parsed data
                inst = instance
                instances[c].append(inst)

        return instances  # try generator

    def prepare_lemmatized_conll2002(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                     output_f=None):
        self.use_wordrep_tree = True  # need parsed data
        docstarts, instances = self.prepare_conll_instances(train_file, train_file_parsed,
                                                            train_files_parsed_path)
        if output_f is None:
            return instances
        else:
            header = "-DOCSTART- -DOCSTART- O"
            with open(output_f, "w") as outfile:
                for n, instance in enumerate(instances):
                    # doc headers
                    if n in docstarts:
                        outfile.write("{}\n".format(header))
                    if isinstance(instance, list):
                        for _, postag, tag, lemma in zip(*instance):
                            outfile.write("{} {} {}\n".format(lemma, postag, tag))
                        outfile.write("\n")
                    else:
                        sys.exit("invalid instance")

    def prepare_conll_instances(self, file, file_parsed, files_parsed_path):
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True,
                                eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            self.tree_vocab = trees.x_dict
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = trees.train[c_append]
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        max_sent_len = 1000000
        max_nr_sent = 1000000

        if file_parsed is None or files_parsed_path is None:
            sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")

        nr_sent = 0
        instances = []
        ex_x = []
        ex_x_pos = []
        ex_y = []
        docstarts = set()  # track docstarts header

        for line in contents:
            if line.startswith("-DOCSTART"):
                docstarts.add(nr_sent)
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instance = [ex_x, ex_x_pos, ex_y]
                    instances.append(instance)
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_x_pos = []
                ex_y = []
            else:
                tag = toks[2]
                postag = toks[1]
                word = toks[0]
                ex_x.append(word)
                ex_x_pos.append(postag)
                ex_y.append(tag)

        for c, instance in enumerate(get_tree(len(instances))):
            ex_z = self.get_words(instance, self.tree_vocab)  # should get lemmas (from ConllCorpus)
            if ex_z is None:
                inst = [i for i in instances[c][0]]
                print("None instance")
            else:
                assert len(ex_z) == len(instances[c][0])
                inst = ex_z
            instances[c].append(inst)

        return docstarts, instances  # try generator

    def normalize_word(self, word):
        if word not in self.wordrep_dict:
            return "*unk*" if word.lower() not in self.wordrep_dict else word.lower()
        else:
            return word

    def normalize_tree(self, tree, trees_vocab, c):
        """
        Recode the name index based on wordrep_dict.
        Modify tree.name such that *unk* or lowercase words are included.
        """
        for node in tree:
            w = trees_vocab.get_label_name(node.name)
            # if c == 0:
            #     print("{}\t{}".format(w, self.normalize_word(w)))
            new_name = self.wordrep_dict.get_label_id(self.normalize_word(w))
            node.set_name(new_name)
        return tree

    def get_words(self, instance, vocab):
        if isinstance(instance, Tree):
            return get_words_from_tree(instance, vocab)
        print("None instance in Conll2002NerCorpus")
        return None

    def write_conll_instances(self, gold, predictions, file, sep=" "):
        """
        Create dataset with appended predictions as the last column.
        """
        assert len(gold) == len(predictions)
        contents = open(file, "w", encoding="iso-8859-1")
        for gold_seq, pred_seq in zip(gold.seq_list, predictions):
            for x, y, y_hat in zip(gold_seq.x, gold_seq.y, pred_seq.y):
                contents.write("{}{sep}{}{sep}{}\n".format(gold_seq.sequence_list.x_dict.get_label_name(x),
                                                           gold_seq.sequence_list.y_dict.get_label_name(y),
                                                           pred_seq.sequence_list.y_dict.get_label_name(y_hat),
                                                           sep=sep))
            contents.write("\n")

    # Dumps a corpus into a file
    def save_corpus(self, dirname):
        if not os.path.isdir(dirname + "/"):
            os.mkdir(dirname + "/")
        #word_fn = open(dir+"word.dic","w")
        #for word_id,word in enumerate(self.int_to_word):
        #    word_fn.write("{}\t{}\n".format(word_id, word))
        #word_fn.close()
        #tag_fn = open(dir+"tag.dic","w")
        #for tag_id,tag in enumerate(self.int_to_tag):
        #    tag_fn.write("{}\t{}\n".format(tag_id, tag))
        #tag_fn.close()
        #word_count_fn = open(dir+"word.count","w")
        #for word_id,counts in self.word_counts.iteritems():
        #    word_count_fn.write("{}\t{}\n".format(word_id,counts))
        #word_count_fn.close()
        self.sequence_list.save(dirname + "sequence_list")

    # Loads a corpus from a file
    def load_corpus(self, dirname):
        word_fn = open(dirname + "word.dic")
        for line in word_fn:
            word_nr, word = line.strip().split("\t")
            self.int_to_word.append(word)
            self.word_dict[word] = int(word_nr)
        word_fn.close()

        tag_fn = open(dirname + "tag.dic")
        for line in tag_fn:
            tag_nr, tag = line.strip().split("\t")
            if tag not in self.tag_dict:
                self.int_to_tag.append(tag)
                self.tag_dict[tag] = int(tag_nr)
        tag_fn.close()

        word_count_fn = open(dirname + "word.count")
        for line in word_count_fn:
            word_nr, word_count = line.strip().split("\t")
            self.word_counts[int(word_nr)] = int(word_count)
        word_count_fn.close()

        self.sequence_list.load(dirname + "sequence_list")
def __init__(self):
    self.word_dict = LabelDictionary()
    self.sequence_list = SequenceList(self.word_dict)
class ConllCorpus:
    def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True, spec_rels=None, dirname=None,
                 eval_spec_rels=False, lr=False):
        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)  # dependency labels
        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        # read built vocab
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle
                self.r_dict = pickle.load(open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id((set(r_dict.names) - set(spec_rels)),
                                             self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")

    def prepare_trees(self):
        self.train = TreeList()
        #print(self.train)
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                self.train.add_tree(t)
            #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare_trees_gen(self):
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                yield t
            #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare(self, sent, lr=False):
        t = BPTree()
        #tracker = ClassTracker()
        #tracker.track_object(t)
        #tracker.create_snapshot()

        # 1st pass: create nodes
        elems = sent.getSentenceLemmas() if self.lemmas else sent.getSentence()
        if lr:
            for w, i in zip(elems, sent.getIds()):
                idx = self.x_dict.get_label_id(w)
                t.add_node(BPNode(i, idx))
        else:
            for w, i, r in zip(elems, sent.getIds(), sent.deprel):
                idx = self.x_dict.get_label_id(w)
                ridx = self.r_dict.get_label_id(r)
                t.add_node(BPNode(i, idx, rel=ridx))
        # add root
        #tracker.create_snapshot("add words of sent")
        idx = self.x_dict.get_label_id("*root*")
        t.add_node(BPNode(0, idx))
        #tracker.create_snapshot("add ROOT")

        # 2nd pass: create edges
        seen = set()  # catch direct loops
        for i, i_head in sent.getHeads():
            # this only catches direct loops; TODO: use is_acyclic check
            if (i, i_head) in seen or (i_head, i) in seen:
                print("Tree with loop caught")
                t = None
                break
            else:
                seen.add((i, i_head))
            if i == i_head:  # not allowed
                print("Skipping sentence: parent is its own child")
                t = None
                break
            parent = t[i_head]
            child = t[i]
            if lr:
                # w occurs left/right of its parent
                child.rel = self.r_dict.get_label_id("left") if i_head > i else self.r_dict.get_label_id("right")
            if parent is None or child is None:
                print()
            edge = BPEdge(parent, child)
            t.add_edge(edge)
            #tracker.create_snapshot("add edge")
            t.add_edge_to_map(parent, child, edge)
            #tracker.create_snapshot("add edge to map")

        return t

    def prepare_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab_dict = reader.getVocabulary(n_sent=self.howbig, add_root=True, lemmas=self.lemmas)
        with open(self.vocab_file, "w") as OUT:
            for w, f in vocab_dict.items():
                OUT.write("{}\t{}\n".format(w, f))
        print("Vocabulary file prepared.")

    def prepare_rel_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab = reader.getRelationVocabulary(n_sent=self.howbig)
        with open(self.rel_file, "w") as OUT:
            for r in vocab:
                OUT.write("{}\n".format(r))
        print("Relation vocabulary file prepared.")
def __init__(self, wordrep_dict=None):
    self.word_dict = LabelDictionary()
    self.tag_dict = LabelDictionary()
    self.sequence_list = None