def __get_instances_from_file(self, file_name):
    """Read a CoNLL file into parallel lists of word and tag sequences.

    :param file_name: path to a CoNLL-formatted file
    :return: (words, tags) — words is a list of token lists, tags the
             corresponding list of tag lists
    """
    data = list(read_conll_file(file_name))
    if not data:
        # keep the two-list return contract for an empty file
        return [], []
    # transpose [(words, tags), ...] into (all_words, all_tags) in one pass
    words, tags = (list(column) for column in zip(*data))
    return words, tags
def make_data(self, file_name, w2i=None, t2i=None, freeze=False):
    """Transform a CoNLL file into index sequences.

    Maps words to indices (w2i; index 0 reserved for PADDING, 1 for UNK)
    and tags to indices (t2i) [in Keras labels need to be integers].

    :param file_name: path to a CoNLL-formatted file
    :param w2i: existing word->index map; a fresh one is created when falsy
    :param t2i: existing tag->index map; a fresh one is created when None
    :param freeze: True = test data (do NOT add new words; unseen words map
                   to _UNK). False = training data (vocabulary may grow).
    :return: (X, Y, w2i, t2i) when freeze is False, else (X, Y, X_org)
    """
    if not w2i:
        w2i = {"<pad>": 0, "_UNK": 1}
    if t2i is None:
        # fixed: previously t2i was only initialized together with w2i,
        # so passing w2i without t2i crashed on t2i.get below
        t2i = {}
    X = []
    Y = []
    X_org = []  # keep original words for type-constr.
    num_sentences = 0
    num_tokens = 0
    for words, tags in read_conll_file(file_name):
        num_sentences += 1
        instance_feats_indices = []  # sequence of word indices
        instance_tags_indices = []   # sequence of tag indices
        for word, tag in zip(words, tags):
            num_tokens += 1
            # map words and tags to indices
            if word in w2i:
                instance_feats_indices.append(w2i[word])
            elif not freeze:
                w2i[word] = len(w2i)
                instance_feats_indices.append(w2i[word])
            else:
                # frozen vocabulary: unseen word becomes UNK
                instance_feats_indices.append(w2i["_UNK"])
            if not freeze and tag not in t2i:
                t2i[tag] = len(t2i)
            # at test time an unseen tag yields None, as before
            instance_tags_indices.append(t2i.get(tag))
        X.append(instance_feats_indices)
        Y.append(instance_tags_indices)
        X_org.append(words)
    print("%s sentences %s tokens" % (num_sentences, num_tokens))
    print("%s features" % len(w2i))
    assert len(X) == len(Y)  # make sure lengths match
    if not freeze:
        return X, Y, w2i, t2i  # return token/tag indices plus mappings
    return X, Y, X_org
def make_data(self, file_name, w2i=None, t2i=None, freeze=False):
    """Transform a CoNLL file into index sequences.

    Maps words to indices (w2i; index 0 reserved for PADDING, 1 for UNK)
    and tags to indices (t2i) [in Keras labels need to be integers].

    :param file_name: path to a CoNLL-formatted file
    :param w2i: existing word->index map; a fresh one is created when falsy
    :param t2i: existing tag->index map; a fresh one is created when None
    :param freeze: True = test data (do NOT add new words; unseen words map
                   to _UNK). False = training data (vocabulary may grow).
    :return: (X, Y, w2i, t2i) when freeze is False, else (X, Y, X_org)
    """
    if not w2i:
        w2i = {"<pad>": 0, "_UNK": 1}
    if t2i is None:
        # fixed: previously t2i was only initialized together with w2i,
        # so passing w2i without t2i crashed on t2i.get below
        t2i = {}
    X = []
    Y = []
    X_org = []  # keep original words for type-constr.
    num_sentences = 0
    num_tokens = 0
    for words, tags in read_conll_file(file_name):
        num_sentences += 1
        instance_feats_indices = []  # sequence of word indices
        instance_tags_indices = []   # sequence of tag indices
        for word, tag in zip(words, tags):
            num_tokens += 1
            # map words and tags to indices
            if word in w2i:
                instance_feats_indices.append(w2i[word])
            elif not freeze:
                w2i[word] = len(w2i)
                instance_feats_indices.append(w2i[word])
            else:
                # frozen vocabulary: unseen word becomes UNK
                instance_feats_indices.append(w2i["_UNK"])
            if not freeze and tag not in t2i:
                t2i[tag] = len(t2i)
            # at test time an unseen tag yields None, as before
            instance_tags_indices.append(t2i.get(tag))
        X.append(instance_feats_indices)
        Y.append(instance_tags_indices)
        X_org.append(words)
    print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
    print("%s features" % len(w2i), file=sys.stderr)
    assert len(X) == len(Y)  # make sure lengths match
    if not freeze:
        return X, Y, w2i, t2i  # return token/tag indices plus mappings
    return X, Y, X_org
def get_data_as_indices(self, file_name):
    """Map a CoNLL file to model input/output index sequences.

    :param file_name: path to a CoNLL-formatted file
    :return: X = list of (word_indices, word_char_indices) per sentence,
             Y = list of tag-index sequences
    """
    X, Y = [], []
    for words, tags in read_conll_file(file_name):
        word_indices, word_char_indices = self.get_features(words)
        # NOTE(review): tags unseen at training time map to None here —
        # confirm downstream evaluation tolerates that
        tag_indices = [self.tag2idx.get(tag) for tag in tags]
        X.append((word_indices, word_char_indices))
        Y.append(tag_indices)
    # removed dead org_X/org_Y accumulation: it was built every call but
    # never returned (the return of it was commented out)
    return X, Y
def get_data_as_indices(self, folder_name, task):
    """Convert a CoNLL file into indexed features for a given task.

    :param folder_name: path to a CoNLL-formatted file
    :param task: task id used to select the tag->index mapping
    :return: X (word/char index pairs), Y (tag indices), org_X (original
             words), org_Y (original tags), task_labels (task id per sentence)
    """
    X, Y = [], []
    org_X, org_Y = [], []
    task_labels = []
    for words, tags in read_conll_file(folder_name):
        w_indices, c_indices = self.get_features(words)
        X.append((w_indices, c_indices))
        Y.append([self.task2tag2idx[task].get(t) for t in tags])
        org_X.append(words)
        org_Y.append(tags)
        task_labels.append(task)
    return X, Y, org_X, org_Y, task_labels
def get_train_data(self, train_data):
    """Build training features from a CoNLL file.

    Creates word/char/tag index mappings from scratch, stores them on the
    instance via set_indices, and returns the indexed training data.

    :param train_data: path to a CoNLL-formatted training file
    :return: X = list of (word_indices, char_indices) per sentence,
             Y = list of tag-index sequences
    """
    X, Y = [], []
    # fresh mappings, reserved entries first
    w2i = {"_UNK": 0}                        # word -> index (0 = unk/OOV)
    c2i = {"_UNK": 0, "<w>": 1, "</w>": 2}   # char -> index (+ word boundaries)
    tag2idx = {}                             # tag -> index
    num_sentences = 0
    num_tokens = 0
    for words, tags in read_conll_file(train_data):
        word_ids = []   # sequence of word indices
        char_ids = []   # per word: list of char indices
        tag_ids = []    # sequence of tag indices
        for word, tag in zip(words, tags):
            if word not in w2i:
                w2i[word] = len(w2i)
            word_ids.append(w2i[word])
            if self.c_in_dim > 0:
                chars = [c2i["<w>"]]
                for ch in word:
                    if ch not in c2i:
                        c2i[ch] = len(c2i)
                    chars.append(c2i[ch])
                chars.append(c2i["</w>"])
                char_ids.append(chars)
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
            tag_ids.append(tag2idx[tag])
            num_tokens += 1
        num_sentences += 1
        X.append((word_ids, char_ids))
        Y.append(tag_ids)
    print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
    print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
    if self.c_in_dim == 0:
        print("char features disabled", file=sys.stderr)
    assert len(X) == len(Y)
    # persist the mappings of words, chars and tags on the tagger
    self.set_indices(w2i, c2i, tag2idx)
    return X, Y
def get_train_data(self, list_folders_name):
    """Build multi-task training data from one CoNLL train file per task.

    Word and char vocabularies are shared across tasks; each task id
    ('task0', 'task1', ...) keeps its own tag inventory.

    :param list_folders_name: list of training file names (one per task)
    :return: X, Y, task_labels, w2i, c2i, task2tag2idx — sequences of
             features, sequences of labels, and the necessary mappings
    """
    X = []
    Y = []
    task_labels = []   # tracks where instances come from: "task0", "task1", ...
    self.tasks_ids = []  # record the ids of the tasks
    # shared vocabularies with reserved entries
    w2i = {"_UNK": 0}                        # unk word / OOV
    c2i = {"_UNK": 0, "<w>": 1, "</w>": 2}   # unk char, word start/end markers
    task2tag2idx = {}  # id of the task -> tag2idx
    for task_idx, folder_name in enumerate(list_folders_name):
        num_sentences = 0
        num_tokens = 0
        task_id = 'task' + str(task_idx)
        self.tasks_ids.append(task_id)
        if task_id not in task2tag2idx:
            task2tag2idx[task_id] = {}
        for words, tags in read_conll_file(folder_name):
            num_sentences += 1
            instance_word_indices = []  # sequence of word indices
            instance_char_indices = []  # per word: list of char indices
            instance_tags_indices = []  # sequence of tag indices
            # fixed: the original reused `i` here, shadowing the task-loop index
            for word, tag in zip(words, tags):
                num_tokens += 1
                # map words and tags to indices
                if word not in w2i:
                    w2i[word] = len(w2i)
                instance_word_indices.append(w2i[word])
                chars_of_word = [c2i["<w>"]]
                for char in word:
                    if char not in c2i:
                        c2i[char] = len(c2i)
                    chars_of_word.append(c2i[char])
                chars_of_word.append(c2i["</w>"])
                instance_char_indices.append(chars_of_word)
                if tag not in task2tag2idx[task_id]:
                    task2tag2idx[task_id][tag] = len(task2tag2idx[task_id])
                instance_tags_indices.append(task2tag2idx[task_id].get(tag))
            X.append((instance_word_indices, instance_char_indices))
            Y.append(instance_tags_indices)
            task_labels.append(task_id)
        if num_sentences == 0 or num_tokens == 0:
            sys.exit("No data read from: " + folder_name)
        print("TASK " + task_id + " " + folder_name, file=sys.stderr)
        print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
    assert len(X) == len(Y)
    return X, Y, task_labels, w2i, c2i, task2tag2idx
# Load the vocabulary and build a tagger skeleton matching the saved config.
vocab = Vocab(vocabfile)
if "embeds" in config:
    # config provides pre-trained embeddings
    tagger = SimpleBiltyTagger(
        config.in_dim,
        config.h_dim,
        config.c_in_dim,
        config.h_layers,
        embeds_file=config.embeds,
        word2id=vocab.word2id,
    )
else:
    tagger = SimpleBiltyTagger(config.in_dim, config.h_dim, config.c_in_dim,
                               config.h_layers, embeds_file=None,
                               word2id=vocab.word2id)
# NOTE(review): the tagger constructed above is immediately discarded by this
# reassignment — confirm whether the construction (and any embedding load it
# performs) is actually needed, or whether it can be removed.
tagger = load_tagger(model)
# Evaluate on the test file and report token accuracy.
test_X, test_Y = tagger.get_data_as_indices(testfile)
correct, total = tagger.evaluate(test_X, test_Y)
print("accuracy", correct / total)
# Collect gold tag sequences to align with predictions in the output file.
dev_test_labels = []
for _, tags in read_conll_file(testfile):
    dev_test_labels.append(tags)
tagger.get_predictions_output(test_X, dev_test_labels, "dev.xxx.out")
def get_train_data(self, list_folders_name):
    """Get train data: read each train set (one CoNLL file per task).

    Word and char vocabularies are shared across tasks; each task keeps
    its own tag inventory. When self.max_vocab_size is set, the word
    vocabulary is pre-built from the most frequent words over all tasks
    and any other word maps to UNK.

    :param list_folders_name: list of training file names (one per task)
    :return: X, Y, task_labels, w2i, c2i, task2tag2idx — sequences of
             features, sequences of labels, and the necessary mappings
    """
    X = []
    Y = []
    task_labels = []   # tracks where instances come from: "task0", "task1", ...
    self.tasks_ids = []  # record ids of the tasks
    # shared vocabularies with reserved entries
    w2i = {UNK: 0}                        # unk word / OOV
    c2i = {UNK: 0, "<w>": 1, "</w>": 2}   # unk char, word start/end markers
    task2tag2idx = {}  # id of the task -> tag2idx
    if self.max_vocab_size is not None:
        # Pre-build a frequency-capped word vocabulary over all tasks.
        word_counter = Counter()
        # stderr for consistency with the other progress output below
        print('Reading files to create vocabulary of size %d.' % self.max_vocab_size,
              file=sys.stderr)
        for folder_name in list_folders_name:
            for words, _ in read_conll_file(folder_name):
                word_counter.update(words)
        # -1 keeps room for the UNK entry already present in w2i
        for word, _ in word_counter.most_common(self.max_vocab_size - 1):
            w2i[word] = len(w2i)
    for task_idx, folder_name in enumerate(list_folders_name):
        num_sentences = 0
        num_tokens = 0
        task_id = 'task' + str(task_idx)
        self.tasks_ids.append(task_id)
        if task_id not in task2tag2idx:
            task2tag2idx[task_id] = {}
        for words, tags in read_conll_file(folder_name):
            num_sentences += 1
            instance_word_indices = []  # sequence of word indices
            instance_char_indices = []  # per word: list of char indices
            instance_tags_indices = []  # sequence of tag indices
            # fixed: the original reused `i` here, shadowing the task-loop index
            for word, tag in zip(words, tags):
                num_tokens += 1
                # map words and tags to indices
                if word not in w2i and self.max_vocab_size is not None:
                    # word fell outside the capped vocabulary -> UNK
                    instance_word_indices.append(w2i[UNK])
                else:
                    if word not in w2i:
                        w2i[word] = len(w2i)
                    instance_word_indices.append(w2i[word])
                if self.c_in_dim > 0:
                    chars_of_word = [c2i["<w>"]]
                    for char in word:
                        if char not in c2i:
                            c2i[char] = len(c2i)
                        chars_of_word.append(c2i[char])
                    chars_of_word.append(c2i["</w>"])
                    instance_char_indices.append(chars_of_word)
                if tag not in task2tag2idx[task_id]:
                    task2tag2idx[task_id][tag] = len(task2tag2idx[task_id])
                instance_tags_indices.append(task2tag2idx[task_id].get(tag))
            X.append((instance_word_indices, instance_char_indices))
            Y.append(instance_tags_indices)
            task_labels.append(task_id)
        if num_sentences == 0 or num_tokens == 0:
            sys.exit("No data read from: " + folder_name)
        print("TASK " + task_id + " " + folder_name, file=sys.stderr)
        print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
    assert len(X) == len(Y)
    return X, Y, task_labels, w2i, c2i, task2tag2idx